diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index b234f013485d..3ae2091f4a0f 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -1,1173 +1,1173 @@ //===--- ARM.cpp - Implement ARM target feature support -------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements ARM TargetInfo objects. // //===----------------------------------------------------------------------===// #include "ARM.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/TargetBuiltins.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" using namespace clang; using namespace clang::targets; void ARMTargetInfo::setABIAAPCS() { IsAAPCS = true; DoubleAlign = LongLongAlign = LongDoubleAlign = SuitableAlign = 64; const llvm::Triple &T = getTriple(); bool IsNetBSD = T.isOSNetBSD(); bool IsOpenBSD = T.isOSOpenBSD(); if (!T.isOSWindows() && !IsNetBSD && !IsOpenBSD) WCharType = UnsignedInt; UseBitFieldTypeAlignment = true; ZeroLengthBitfieldBoundary = 0; // Thumb1 add sp, #imm requires the immediate value be multiple of 4, // so set preferred for small types to 32. if (T.isOSBinFormatMachO()) { resetDataLayout(BigEndian ? "E-m:o-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" : "e-m:o-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"); } else if (T.isOSWindows()) { assert(!BigEndian && "Windows on ARM does not support big endian"); resetDataLayout("e" "-m:w" "-p:32:32" "-Fi8" "-i64:64" "-v128:64:128" "-a:0:32" "-n32" "-S64"); } else if (T.isOSNaCl()) { assert(!BigEndian && "NaCl on ARM does not support big endian"); resetDataLayout("e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S128"); } else { resetDataLayout(BigEndian ? "E-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" : "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"); } // FIXME: Enumerated types are variable width in straight AAPCS. } void ARMTargetInfo::setABIAPCS(bool IsAAPCS16) { const llvm::Triple &T = getTriple(); IsAAPCS = false; if (IsAAPCS16) DoubleAlign = LongLongAlign = LongDoubleAlign = SuitableAlign = 64; else DoubleAlign = LongLongAlign = LongDoubleAlign = SuitableAlign = 32; WCharType = SignedInt; // Do not respect the alignment of bit-field types when laying out // structures. This corresponds to PCC_BITFIELD_TYPE_MATTERS in gcc. UseBitFieldTypeAlignment = false; /// gcc forces the alignment to 4 bytes, regardless of the type of the /// zero length bitfield. This corresponds to EMPTY_FIELD_BOUNDARY in /// gcc. ZeroLengthBitfieldBoundary = 32; if (T.isOSBinFormatMachO() && IsAAPCS16) { assert(!BigEndian && "AAPCS16 does not support big-endian"); resetDataLayout("e-m:o-p:32:32-Fi8-i64:64-a:0:32-n32-S128"); } else if (T.isOSBinFormatMachO()) resetDataLayout( BigEndian ? "E-m:o-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" : "e-m:o-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"); else resetDataLayout( BigEndian ? "E-m:e-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" : "e-m:e-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"); // FIXME: Override "preferred align" for double and long long. 
} void ARMTargetInfo::setArchInfo() { StringRef ArchName = getTriple().getArchName(); ArchISA = llvm::ARM::parseArchISA(ArchName); CPU = llvm::ARM::getDefaultCPU(ArchName); llvm::ARM::ArchKind AK = llvm::ARM::parseArch(ArchName); if (AK != llvm::ARM::ArchKind::INVALID) ArchKind = AK; setArchInfo(ArchKind); } void ARMTargetInfo::setArchInfo(llvm::ARM::ArchKind Kind) { StringRef SubArch; // cache TargetParser info ArchKind = Kind; SubArch = llvm::ARM::getSubArch(ArchKind); ArchProfile = llvm::ARM::parseArchProfile(SubArch); ArchVersion = llvm::ARM::parseArchVersion(SubArch); // cache CPU related strings CPUAttr = getCPUAttr(); CPUProfile = getCPUProfile(); } void ARMTargetInfo::setAtomic() { // when triple does not specify a sub arch, // then we are not using inline atomics bool ShouldUseInlineAtomic = (ArchISA == llvm::ARM::ISAKind::ARM && ArchVersion >= 6) || (ArchISA == llvm::ARM::ISAKind::THUMB && ArchVersion >= 7); // Cortex M does not support 8 byte atomics, while general Thumb2 does. if (ArchProfile == llvm::ARM::ProfileKind::M) { MaxAtomicPromoteWidth = 32; if (ShouldUseInlineAtomic) MaxAtomicInlineWidth = 32; } else { MaxAtomicPromoteWidth = 64; if (ShouldUseInlineAtomic) MaxAtomicInlineWidth = 64; } } bool ARMTargetInfo::hasMVE() const { return ArchKind == llvm::ARM::ArchKind::ARMV8_1MMainline && MVE != 0; } bool ARMTargetInfo::hasMVEFloat() const { return hasMVE() && (MVE & MVE_FP); } bool ARMTargetInfo::isThumb() const { return ArchISA == llvm::ARM::ISAKind::THUMB; } bool ARMTargetInfo::supportsThumb() const { return CPUAttr.count('T') || ArchVersion >= 6; } bool ARMTargetInfo::supportsThumb2() const { return CPUAttr.equals("6T2") || (ArchVersion >= 7 && !CPUAttr.equals("8M_BASE")); } StringRef ARMTargetInfo::getCPUAttr() const { // For most sub-arches, the build attribute CPU name is enough. // For Cortex variants, it's slightly different. switch (ArchKind) { default: return llvm::ARM::getCPUAttr(ArchKind); case llvm::ARM::ArchKind::ARMV6M: return "6M"; case llvm::ARM::ArchKind::ARMV7S: return "7S"; case llvm::ARM::ArchKind::ARMV7A: return "7A"; case llvm::ARM::ArchKind::ARMV7R: return "7R"; case llvm::ARM::ArchKind::ARMV7M: return "7M"; case llvm::ARM::ArchKind::ARMV7EM: return "7EM"; case llvm::ARM::ArchKind::ARMV7VE: return "7VE"; case llvm::ARM::ArchKind::ARMV8A: return "8A"; case llvm::ARM::ArchKind::ARMV8_1A: return "8_1A"; case llvm::ARM::ArchKind::ARMV8_2A: return "8_2A"; case llvm::ARM::ArchKind::ARMV8_3A: return "8_3A"; case llvm::ARM::ArchKind::ARMV8_4A: return "8_4A"; case llvm::ARM::ArchKind::ARMV8_5A: return "8_5A"; case llvm::ARM::ArchKind::ARMV8MBaseline: return "8M_BASE"; case llvm::ARM::ArchKind::ARMV8MMainline: return "8M_MAIN"; case llvm::ARM::ArchKind::ARMV8R: return "8R"; case llvm::ARM::ArchKind::ARMV8_1MMainline: return "8_1M_MAIN"; } } StringRef ARMTargetInfo::getCPUProfile() const { switch (ArchProfile) { case llvm::ARM::ProfileKind::A: return "A"; case llvm::ARM::ProfileKind::R: return "R"; case llvm::ARM::ProfileKind::M: return "M"; default: return ""; } } ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0), HW_FP(0) { bool IsOpenBSD = Triple.isOSOpenBSD(); bool IsNetBSD = Triple.isOSNetBSD(); // FIXME: the isOSBinFormatMachO is a workaround for identifying a Darwin-like // environment where size_t is `unsigned long` rather than `unsigned int` PtrDiffType = IntPtrType = (Triple.isOSDarwin() || Triple.isOSBinFormatMachO() || IsOpenBSD || IsNetBSD) ? 
SignedLong : SignedInt; SizeType = (Triple.isOSDarwin() || Triple.isOSBinFormatMachO() || IsOpenBSD || IsNetBSD) ? UnsignedLong : UnsignedInt; // ptrdiff_t is inconsistent on Darwin if ((Triple.isOSDarwin() || Triple.isOSBinFormatMachO()) && !Triple.isWatchABI()) PtrDiffType = SignedInt; // Cache arch related info. setArchInfo(); // {} in inline assembly are neon specifiers, not assembly variant // specifiers. NoAsmVariants = true; // FIXME: This duplicates code from the driver that sets the -target-abi // option - this code is used if -target-abi isn't passed and should // be unified in some way. if (Triple.isOSBinFormatMachO()) { // The backend is hardwired to assume AAPCS for M-class processors, ensure // the frontend matches that. if (Triple.getEnvironment() == llvm::Triple::EABI || Triple.getOS() == llvm::Triple::UnknownOS || ArchProfile == llvm::ARM::ProfileKind::M) { setABI("aapcs"); } else if (Triple.isWatchABI()) { setABI("aapcs16"); } else { setABI("apcs-gnu"); } } else if (Triple.isOSWindows()) { // FIXME: this is invalid for WindowsCE setABI("aapcs"); } else { // Select the default based on the platform. switch (Triple.getEnvironment()) { case llvm::Triple::Android: case llvm::Triple::GNUEABI: case llvm::Triple::GNUEABIHF: case llvm::Triple::MuslEABI: case llvm::Triple::MuslEABIHF: setABI("aapcs-linux"); break; case llvm::Triple::EABIHF: case llvm::Triple::EABI: setABI("aapcs"); break; case llvm::Triple::GNU: setABI("apcs-gnu"); break; default: if (IsNetBSD) setABI("apcs-gnu"); else if (IsOpenBSD) setABI("aapcs-linux"); else setABI("aapcs"); break; } } // ARM targets default to using the ARM C++ ABI. TheCXXABI.set(TargetCXXABI::GenericARM); // ARM has atomics up to 8 bytes setAtomic(); // Maximum alignment for ARM NEON data types should be 64-bits (AAPCS) // as well the default alignment if (IsAAPCS && (Triple.getEnvironment() != llvm::Triple::Android)) DefaultAlignForAttributeAligned = MaxVectorAlign = 64; // Do force alignment of members that follow zero length bitfields. If // the alignment of the zero-length bitfield is greater than the member // that follows it, `bar', `bar' will be aligned as the type of the // zero length bitfield. UseZeroLengthBitfieldAlignment = true; if (Triple.getOS() == llvm::Triple::Linux || Triple.getOS() == llvm::Triple::UnknownOS) this->MCountName = Opts.EABIVersion == llvm::EABI::GNU
-                           ? "llvm.arm.gnu.eabi.mcount"
+                           ? "\01__gnu_mcount_nc"
                            : "\01mcount"; SoftFloatABI = llvm::is_contained(Opts.FeaturesAsWritten, "+soft-float-abi"); } StringRef ARMTargetInfo::getABI() const { return ABI; } bool ARMTargetInfo::setABI(const std::string &Name) { ABI = Name; // The defaults (above) are for AAPCS, check if we need to change them. // // FIXME: We need support for -meabi... we could just mangle it into the // name. if (Name == "apcs-gnu" || Name == "aapcs16") { setABIAPCS(Name == "aapcs16"); return true; } if (Name == "aapcs" || Name == "aapcs-vfp" || Name == "aapcs-linux") { setABIAAPCS(); return true; } return false; } // FIXME: This should be based on Arch attributes, not CPU names. bool ARMTargetInfo::initFeatureMap( llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector<std::string> &FeaturesVec) const { std::string ArchFeature; std::vector<StringRef> TargetFeatures; llvm::ARM::ArchKind Arch = llvm::ARM::parseArch(getTriple().getArchName()); // Map the base architecture to an appropriate target feature, so we don't // rely on the target triple.
llvm::ARM::ArchKind CPUArch = llvm::ARM::parseCPUArch(CPU); if (CPUArch == llvm::ARM::ArchKind::INVALID) CPUArch = Arch; if (CPUArch != llvm::ARM::ArchKind::INVALID) { ArchFeature = ("+" + llvm::ARM::getArchName(CPUArch)).str(); TargetFeatures.push_back(ArchFeature); } // get default FPU features unsigned FPUKind = llvm::ARM::getDefaultFPU(CPU, Arch); llvm::ARM::getFPUFeatures(FPUKind, TargetFeatures); // get default Extension features unsigned Extensions = llvm::ARM::getDefaultExtensions(CPU, Arch); llvm::ARM::getExtensionFeatures(Extensions, TargetFeatures); for (auto Feature : TargetFeatures) if (Feature[0] == '+') Features[Feature.drop_front(1)] = true; // Enable or disable thumb-mode explicitly per function to enable mixed // ARM and Thumb code generation. if (isThumb()) Features["thumb-mode"] = true; else Features["thumb-mode"] = false; // Convert user-provided arm and thumb GNU target attributes to // [-|+]thumb-mode target features respectively. std::vector<std::string> UpdatedFeaturesVec; for (const auto &Feature : FeaturesVec) { // Skip soft-float-abi; it's something we only use to initialize a bit of // class state, and is otherwise unrecognized. if (Feature == "+soft-float-abi") continue; StringRef FixedFeature; if (Feature == "+arm") FixedFeature = "-thumb-mode"; else if (Feature == "+thumb") FixedFeature = "+thumb-mode"; else FixedFeature = Feature; UpdatedFeaturesVec.push_back(FixedFeature.str()); } return TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec); } bool ARMTargetInfo::handleTargetFeatures(std::vector<std::string> &Features, DiagnosticsEngine &Diags) { FPU = 0; MVE = 0; CRC = 0; Crypto = 0; DSP = 0; Unaligned = 1; SoftFloat = false; // Note that SoftFloatABI is initialized in our constructor. HWDiv = 0; DotProd = 0; HasFloat16 = true; // This does not diagnose illegal cases like having both // "+vfpv2" and "+vfp3" or having "+neon" and "-fp64".
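// Illustration (hypothetical input, not part of the patch): tracing the loop
// below with the feature list {"+vfp4", "+neon"} leaves FPU = VFP4FPU | NeonFPU
// and HW_FP = HW_FP_SP | HW_FP_HP | HW_FP_DP, since "+vfp4" (unlike "+vfp4sp"
// or "+vfp4d16sp") implies double precision while "+neon" adds only single.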
for (const auto &Feature : Features) { if (Feature == "+soft-float") { SoftFloat = true; } else if (Feature == "+vfp2sp" || Feature == "+vfp2d16sp" || Feature == "+vfp2" || Feature == "+vfp2d16") { FPU |= VFP2FPU; HW_FP |= HW_FP_SP; if (Feature == "+vfp2" || Feature == "+vfp2d16") HW_FP |= HW_FP_DP; } else if (Feature == "+vfp3sp" || Feature == "+vfp3d16sp" || Feature == "+vfp3" || Feature == "+vfp3d16") { FPU |= VFP3FPU; HW_FP |= HW_FP_SP; if (Feature == "+vfp3" || Feature == "+vfp3d16") HW_FP |= HW_FP_DP; } else if (Feature == "+vfp4sp" || Feature == "+vfp4d16sp" || Feature == "+vfp4" || Feature == "+vfp4d16") { FPU |= VFP4FPU; HW_FP |= HW_FP_SP | HW_FP_HP; if (Feature == "+vfp4" || Feature == "+vfp4d16") HW_FP |= HW_FP_DP; } else if (Feature == "+fp-armv8sp" || Feature == "+fp-armv8d16sp" || Feature == "+fp-armv8" || Feature == "+fp-armv8d16") { FPU |= FPARMV8; HW_FP |= HW_FP_SP | HW_FP_HP; if (Feature == "+fp-armv8" || Feature == "+fp-armv8d16") HW_FP |= HW_FP_DP; } else if (Feature == "+neon") { FPU |= NeonFPU; HW_FP |= HW_FP_SP; } else if (Feature == "+hwdiv") { HWDiv |= HWDivThumb; } else if (Feature == "+hwdiv-arm") { HWDiv |= HWDivARM; } else if (Feature == "+crc") { CRC = 1; } else if (Feature == "+crypto") { Crypto = 1; } else if (Feature == "+dsp") { DSP = 1; } else if (Feature == "+fp64") { HW_FP |= HW_FP_DP; } else if (Feature == "+8msecext") { if (CPUProfile != "M" || ArchVersion != 8) { Diags.Report(diag::err_target_unsupported_mcmse) << CPU; return false; } } else if (Feature == "+strict-align") { Unaligned = 0; } else if (Feature == "+fp16") { HW_FP |= HW_FP_HP; } else if (Feature == "+fullfp16") { HasLegalHalfType = true; } else if (Feature == "+dotprod") { DotProd = true; } else if (Feature == "+mve") { DSP = 1; MVE |= MVE_INT; } else if (Feature == "+mve.fp") { DSP = 1; HasLegalHalfType = true; FPU |= FPARMV8; MVE |= MVE_INT | MVE_FP; HW_FP |= HW_FP_SP | HW_FP_HP; } } switch (ArchVersion) { case 6: if (ArchProfile == llvm::ARM::ProfileKind::M) LDREX = 0; else if (ArchKind == llvm::ARM::ArchKind::ARMV6K) LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B; else LDREX = LDREX_W; break; case 7: if (ArchProfile == llvm::ARM::ProfileKind::M) LDREX = LDREX_W | LDREX_H | LDREX_B; else LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B; break; case 8: LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B; } if (!(FPU & NeonFPU) && FPMath == FP_Neon) { Diags.Report(diag::err_target_unsupported_fpmath) << "neon"; return false; } if (FPMath == FP_Neon) Features.push_back("+neonfp"); else if (FPMath == FP_VFP) Features.push_back("-neonfp"); return true; } bool ARMTargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) .Case("arm", true) .Case("aarch32", true) .Case("softfloat", SoftFloat) .Case("thumb", isThumb()) .Case("neon", (FPU & NeonFPU) && !SoftFloat) .Case("vfp", FPU && !SoftFloat) .Case("hwdiv", HWDiv & HWDivThumb) .Case("hwdiv-arm", HWDiv & HWDivARM) .Case("mve", hasMVE()) .Default(false); } bool ARMTargetInfo::isValidCPUName(StringRef Name) const { return Name == "generic" || llvm::ARM::parseCPUArch(Name) != llvm::ARM::ArchKind::INVALID; } void ARMTargetInfo::fillValidCPUList(SmallVectorImpl<StringRef> &Values) const { llvm::ARM::fillValidCPUArchList(Values); } bool ARMTargetInfo::setCPU(const std::string &Name) { if (Name != "generic") setArchInfo(llvm::ARM::parseCPUArch(Name)); if (ArchKind == llvm::ARM::ArchKind::INVALID) return false; setAtomic(); CPU = Name; return true; } bool ARMTargetInfo::setFPMath(StringRef Name) { if (Name == "neon") { FPMath = FP_Neon; return
true; } else if (Name == "vfp" || Name == "vfp2" || Name == "vfp3" || Name == "vfp4") { FPMath = FP_VFP; return true; } return false; } void ARMTargetInfo::getTargetDefinesARMV81A(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__ARM_FEATURE_QRDMX", "1"); } void ARMTargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts, MacroBuilder &Builder) const { // Also include the ARMv8.1-A defines getTargetDefinesARMV81A(Opts, Builder); } void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { // Target identification. Builder.defineMacro("__arm"); Builder.defineMacro("__arm__"); // For bare-metal none-eabi. if (getTriple().getOS() == llvm::Triple::UnknownOS && (getTriple().getEnvironment() == llvm::Triple::EABI || getTriple().getEnvironment() == llvm::Triple::EABIHF)) Builder.defineMacro("__ELF__"); // Target properties. Builder.defineMacro("__REGISTER_PREFIX__", ""); // Unfortunately, __ARM_ARCH_7K__ is now more of an ABI descriptor. The CPU // happens to be Cortex-A7 though, so it should still get __ARM_ARCH_7A__. if (getTriple().isWatchABI()) Builder.defineMacro("__ARM_ARCH_7K__", "2"); if (!CPUAttr.empty()) Builder.defineMacro("__ARM_ARCH_" + CPUAttr + "__"); // ACLE 6.4.1 ARM/Thumb instruction set architecture // __ARM_ARCH is defined as an integer value indicating the current ARM ISA Builder.defineMacro("__ARM_ARCH", Twine(ArchVersion)); if (ArchVersion >= 8) { // ACLE 6.5.7 Crypto Extension if (Crypto) Builder.defineMacro("__ARM_FEATURE_CRYPTO", "1"); // ACLE 6.5.8 CRC32 Extension if (CRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); // ACLE 6.5.10 Numeric Maximum and Minimum Builder.defineMacro("__ARM_FEATURE_NUMERIC_MAXMIN", "1"); // ACLE 6.5.9 Directed Rounding Builder.defineMacro("__ARM_FEATURE_DIRECTED_ROUNDING", "1"); } // __ARM_ARCH_ISA_ARM is defined to 1 if the core supports the ARM ISA. It // is not defined for the M-profile. // NOTE that the default profile is assumed to be 'A' if (CPUProfile.empty() || ArchProfile != llvm::ARM::ProfileKind::M) Builder.defineMacro("__ARM_ARCH_ISA_ARM", "1"); // __ARM_ARCH_ISA_THUMB is defined to 1 if the core supports the original // Thumb ISA (including v6-M and v8-M Baseline). It is set to 2 if the // core supports the Thumb-2 ISA as found in the v6T2 architecture and all // v7 and v8 architectures excluding v8-M Baseline. if (supportsThumb2()) Builder.defineMacro("__ARM_ARCH_ISA_THUMB", "2"); else if (supportsThumb()) Builder.defineMacro("__ARM_ARCH_ISA_THUMB", "1"); // __ARM_32BIT_STATE is defined to 1 if code is being generated for a 32-bit // instruction set such as ARM or Thumb. Builder.defineMacro("__ARM_32BIT_STATE", "1"); // ACLE 6.4.2 Architectural Profile (A, R, M or pre-Cortex) // __ARM_ARCH_PROFILE is defined as 'A', 'R', 'M' or 'S', or unset. if (!CPUProfile.empty()) Builder.defineMacro("__ARM_ARCH_PROFILE", "'" + CPUProfile + "'"); // ACLE 6.4.3 Unaligned access supported in hardware if (Unaligned) Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1"); // ACLE 6.4.4 LDREX/STREX if (LDREX) Builder.defineMacro("__ARM_FEATURE_LDREX", "0x" + Twine::utohexstr(LDREX)); // ACLE 6.4.5 CLZ if (ArchVersion == 5 || (ArchVersion == 6 && CPUProfile != "M") || ArchVersion > 6) Builder.defineMacro("__ARM_FEATURE_CLZ", "1"); // ACLE 6.5.1 Hardware Floating Point if (HW_FP) Builder.defineMacro("__ARM_FP", "0x" + Twine::utohexstr(HW_FP)); // ACLE predefines. Builder.defineMacro("__ARM_ACLE", "200"); // FP16 support (we currently only support IEEE format). 
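// Illustrative use from C (a sketch, assuming the usual ACLE encoding in which
// bits 0x02/0x04/0x08 of the __ARM_FP value defined above correspond to
// HW_FP_HP/HW_FP_SP/HW_FP_DP):
//   #if defined(__ARM_FP) && (__ARM_FP & 0x08)
//   /* hardware double-precision floating point is available */
//   #endif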
Builder.defineMacro("__ARM_FP16_FORMAT_IEEE", "1"); Builder.defineMacro("__ARM_FP16_ARGS", "1"); // ACLE 6.5.3 Fused multiply-accumulate (FMA) if (ArchVersion >= 7 && (FPU & VFP4FPU)) Builder.defineMacro("__ARM_FEATURE_FMA", "1"); // Subtarget options. // FIXME: It's more complicated than this and we don't really support // interworking. // Windows on ARM does not "support" interworking if (5 <= ArchVersion && ArchVersion <= 8 && !getTriple().isOSWindows()) Builder.defineMacro("__THUMB_INTERWORK__"); if (ABI == "aapcs" || ABI == "aapcs-linux" || ABI == "aapcs-vfp") { // Embedded targets on Darwin follow AAPCS, but not EABI. // Windows on ARM follows AAPCS VFP, but does not conform to EABI. if (!getTriple().isOSBinFormatMachO() && !getTriple().isOSWindows()) Builder.defineMacro("__ARM_EABI__"); Builder.defineMacro("__ARM_PCS", "1"); } if ((!SoftFloat && !SoftFloatABI) || ABI == "aapcs-vfp" || ABI == "aapcs16") Builder.defineMacro("__ARM_PCS_VFP", "1"); if (SoftFloat) Builder.defineMacro("__SOFTFP__"); // ACLE position independent code macros. if (Opts.ROPI) Builder.defineMacro("__ARM_ROPI", "1"); if (Opts.RWPI) Builder.defineMacro("__ARM_RWPI", "1"); if (ArchKind == llvm::ARM::ArchKind::XSCALE) Builder.defineMacro("__XSCALE__"); if (isThumb()) { Builder.defineMacro("__THUMBEL__"); Builder.defineMacro("__thumb__"); if (supportsThumb2()) Builder.defineMacro("__thumb2__"); } // ACLE 6.4.9 32-bit SIMD instructions if ((CPUProfile != "M" && ArchVersion >= 6) || (CPUProfile == "M" && DSP)) Builder.defineMacro("__ARM_FEATURE_SIMD32", "1"); // ACLE 6.4.10 Hardware Integer Divide if (((HWDiv & HWDivThumb) && isThumb()) || ((HWDiv & HWDivARM) && !isThumb())) { Builder.defineMacro("__ARM_FEATURE_IDIV", "1"); Builder.defineMacro("__ARM_ARCH_EXT_IDIV__", "1"); } // Note, this is always on in gcc, even though it doesn't make sense. Builder.defineMacro("__APCS_32__"); if (FPUModeIsVFP((FPUMode)FPU)) { Builder.defineMacro("__VFP_FP__"); if (FPU & VFP2FPU) Builder.defineMacro("__ARM_VFPV2__"); if (FPU & VFP3FPU) Builder.defineMacro("__ARM_VFPV3__"); if (FPU & VFP4FPU) Builder.defineMacro("__ARM_VFPV4__"); if (FPU & FPARMV8) Builder.defineMacro("__ARM_FPV5__"); } // This only gets set when Neon instructions are actually available, unlike // the VFP define, hence the soft float and arch check. This is subtly // different from gcc, we follow the intent which was that it should be set // when Neon instructions are actually available. if ((FPU & NeonFPU) && !SoftFloat && ArchVersion >= 7) { Builder.defineMacro("__ARM_NEON", "1"); Builder.defineMacro("__ARM_NEON__"); // current AArch32 NEON implementations do not support double-precision // floating-point even when it is present in VFP. Builder.defineMacro("__ARM_NEON_FP", "0x" + Twine::utohexstr(HW_FP & ~HW_FP_DP)); } if (hasMVE()) { Builder.defineMacro("__ARM_FEATURE_MVE", hasMVEFloat() ? "3" : "1"); } Builder.defineMacro("__ARM_SIZEOF_WCHAR_T", Twine(Opts.WCharSize ? Opts.WCharSize : 4)); Builder.defineMacro("__ARM_SIZEOF_MINIMAL_ENUM", Opts.ShortEnums ? "1" : "4"); // CMSE if (ArchVersion == 8 && ArchProfile == llvm::ARM::ProfileKind::M) Builder.defineMacro("__ARM_FEATURE_CMSE", Opts.Cmse ? 
"3" : "1"); if (ArchVersion >= 6 && CPUAttr != "6M" && CPUAttr != "8M_BASE") { Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8"); } // ACLE 6.4.7 DSP instructions if (DSP) { Builder.defineMacro("__ARM_FEATURE_DSP", "1"); } // ACLE 6.4.8 Saturation instructions bool SAT = false; if ((ArchVersion == 6 && CPUProfile != "M") || ArchVersion > 6) { Builder.defineMacro("__ARM_FEATURE_SAT", "1"); SAT = true; } // ACLE 6.4.6 Q (saturation) flag if (DSP || SAT) Builder.defineMacro("__ARM_FEATURE_QBIT", "1"); if (Opts.UnsafeFPMath) Builder.defineMacro("__ARM_FP_FAST", "1"); // Armv8.2-A FP16 vector intrinsic if ((FPU & NeonFPU) && HasLegalHalfType) Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1"); // Armv8.2-A FP16 scalar intrinsics if (HasLegalHalfType) Builder.defineMacro("__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", "1"); // Armv8.2-A dot product intrinsics if (DotProd) Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1"); switch (ArchKind) { default: break; case llvm::ARM::ArchKind::ARMV8_1A: getTargetDefinesARMV81A(Opts, Builder); break; case llvm::ARM::ArchKind::ARMV8_2A: getTargetDefinesARMV82A(Opts, Builder); break; } } const Builtin::Info ARMTargetInfo::BuiltinInfo[] = { #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \ {#ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr}, #include "clang/Basic/BuiltinsNEON.def" #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \ {#ID, TYPE, ATTRS, nullptr, LANG, nullptr}, #define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \ {#ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr}, #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \ {#ID, TYPE, ATTRS, HEADER, LANGS, FEATURE}, #include "clang/Basic/BuiltinsARM.def" }; ArrayRef ARMTargetInfo::getTargetBuiltins() const { return llvm::makeArrayRef(BuiltinInfo, clang::ARM::LastTSBuiltin - Builtin::FirstTSBuiltin); } bool ARMTargetInfo::isCLZForZeroUndef() const { return false; } TargetInfo::BuiltinVaListKind ARMTargetInfo::getBuiltinVaListKind() const { return IsAAPCS ? AAPCSABIBuiltinVaList : (getTriple().isWatchABI() ? 
TargetInfo::CharPtrBuiltinVaList : TargetInfo::VoidPtrBuiltinVaList); } const char *const ARMTargetInfo::GCCRegNames[] = { // Integer registers "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc", // Float registers "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", // Double registers "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", // Quad registers "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"}; ArrayRef<const char *> ARMTargetInfo::getGCCRegNames() const { return llvm::makeArrayRef(GCCRegNames); } const TargetInfo::GCCRegAlias ARMTargetInfo::GCCRegAliases[] = { {{"a1"}, "r0"}, {{"a2"}, "r1"}, {{"a3"}, "r2"}, {{"a4"}, "r3"}, {{"v1"}, "r4"}, {{"v2"}, "r5"}, {{"v3"}, "r6"}, {{"v4"}, "r7"}, {{"v5"}, "r8"}, {{"v6", "rfp"}, "r9"}, {{"sl"}, "r10"}, {{"fp"}, "r11"}, {{"ip"}, "r12"}, {{"r13"}, "sp"}, {{"r14"}, "lr"}, {{"r15"}, "pc"}, // The S, D and Q registers overlap, but aren't really aliases; we // don't want to substitute one of these for a different-sized one. }; ArrayRef<TargetInfo::GCCRegAlias> ARMTargetInfo::getGCCRegAliases() const { return llvm::makeArrayRef(GCCRegAliases); } bool ARMTargetInfo::validateAsmConstraint( const char *&Name, TargetInfo::ConstraintInfo &Info) const { switch (*Name) { default: break; case 'l': // r0-r7 case 'h': // r8-r15 case 't': // VFP Floating point register single precision case 'w': // VFP Floating point register double precision Info.setAllowsRegister(); return true; case 'I': case 'J': case 'K': case 'L': case 'M': // FIXME return true; case 'Q': // A memory address that is a single base register. Info.setAllowsMemory(); return true; case 'T': switch (Name[1]) { default: break; case 'e': // Even general-purpose register case 'o': // Odd general-purpose register Info.setAllowsRegister(); Name++; return true; } break; case 'U': // a memory reference... switch (Name[1]) { case 'q': // ...ARMV4 ldrsb case 'v': // ...VFP load/store (reg+constant offset) case 'y': // ...iWMMXt load/store case 't': // address valid for load/store opaque types wider // than 128-bits case 'n': // valid address for Neon doubleword vector load/store case 'm': // valid address for Neon element and structure load/store case 's': // valid address for non-offset loads/stores of quad-word // values in four ARM registers Info.setAllowsMemory(); Name++; return true; } break; } return false; } std::string ARMTargetInfo::convertConstraint(const char *&Constraint) const { std::string R; switch (*Constraint) { case 'U': // Two-character constraint; add "^" hint for later parsing. case 'T': R = std::string("^") + std::string(Constraint, 2); Constraint++; break; case 'p': // 'p' should be translated to 'r' by default. R = std::string("r"); break; default: return std::string(1, *Constraint); } return R; } bool ARMTargetInfo::validateConstraintModifier( StringRef Constraint, char Modifier, unsigned Size, std::string &SuggestedModifier) const { bool isOutput = (Constraint[0] == '='); bool isInOut = (Constraint[0] == '+'); // Strip off constraint modifiers.
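// Illustration (hypothetical user code, not part of the patch): the letters
// validated in validateAsmConstraint above surface in GNU inline asm, e.g.
// "l" pins an operand to the low registers r0-r7 that most Thumb1 encodings
// require:
//   int inc(int x) {
//     int r;
//     __asm__("adds %0, %1, #1" : "=l"(r) : "l"(x));
//     return r;
//   }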
while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&') Constraint = Constraint.substr(1); switch (Constraint[0]) { default: break; case 'r': { switch (Modifier) { default: return (isInOut || isOutput || Size <= 64); case 'q': // A register of size 32 cannot fit a vector type. return false; } } } return true; } const char *ARMTargetInfo::getClobbers() const { // FIXME: Is this really right? return ""; } TargetInfo::CallingConvCheckResult ARMTargetInfo::checkCallingConvention(CallingConv CC) const { switch (CC) { case CC_AAPCS: case CC_AAPCS_VFP: case CC_Swift: case CC_OpenCLKernel: return CCCR_OK; default: return CCCR_Warning; } } int ARMTargetInfo::getEHDataRegisterNumber(unsigned RegNo) const { if (RegNo == 0) return 0; if (RegNo == 1) return 1; return -1; } bool ARMTargetInfo::hasSjLjLowering() const { return true; } ARMleTargetInfo::ARMleTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : ARMTargetInfo(Triple, Opts) {} void ARMleTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__ARMEL__"); ARMTargetInfo::getTargetDefines(Opts, Builder); } ARMbeTargetInfo::ARMbeTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : ARMTargetInfo(Triple, Opts) {} void ARMbeTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__ARMEB__"); Builder.defineMacro("__ARM_BIG_ENDIAN"); ARMTargetInfo::getTargetDefines(Opts, Builder); } WindowsARMTargetInfo::WindowsARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : WindowsTargetInfo<ARMleTargetInfo>(Triple, Opts), Triple(Triple) { } void WindowsARMTargetInfo::getVisualStudioDefines(const LangOptions &Opts, MacroBuilder &Builder) const { // FIXME: this is invalid for WindowsCE Builder.defineMacro("_M_ARM_NT", "1"); Builder.defineMacro("_M_ARMT", "_M_ARM"); Builder.defineMacro("_M_THUMB", "_M_ARM"); assert((Triple.getArch() == llvm::Triple::arm || Triple.getArch() == llvm::Triple::thumb) && "invalid architecture for Windows ARM target info"); unsigned Offset = Triple.getArch() == llvm::Triple::arm ?
4 : 6; Builder.defineMacro("_M_ARM", Triple.getArchName().substr(Offset)); // TODO map the complete set of values // 31: VFPv3 40: VFPv4 Builder.defineMacro("_M_ARM_FP", "31"); } TargetInfo::BuiltinVaListKind WindowsARMTargetInfo::getBuiltinVaListKind() const { return TargetInfo::CharPtrBuiltinVaList; } TargetInfo::CallingConvCheckResult WindowsARMTargetInfo::checkCallingConvention(CallingConv CC) const { switch (CC) { case CC_X86StdCall: case CC_X86ThisCall: case CC_X86FastCall: case CC_X86VectorCall: return CCCR_Ignore; case CC_C: case CC_OpenCLKernel: case CC_PreserveMost: case CC_PreserveAll: case CC_Swift: return CCCR_OK; default: return CCCR_Warning; } } // Windows ARM + Itanium C++ ABI Target ItaniumWindowsARMleTargetInfo::ItaniumWindowsARMleTargetInfo( const llvm::Triple &Triple, const TargetOptions &Opts) : WindowsARMTargetInfo(Triple, Opts) { TheCXXABI.set(TargetCXXABI::GenericARM); } void ItaniumWindowsARMleTargetInfo::getTargetDefines( const LangOptions &Opts, MacroBuilder &Builder) const { WindowsARMTargetInfo::getTargetDefines(Opts, Builder); if (Opts.MSVCCompat) WindowsARMTargetInfo::getVisualStudioDefines(Opts, Builder); } // Windows ARM, MS (C++) ABI MicrosoftARMleTargetInfo::MicrosoftARMleTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : WindowsARMTargetInfo(Triple, Opts) { TheCXXABI.set(TargetCXXABI::Microsoft); } void MicrosoftARMleTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { WindowsARMTargetInfo::getTargetDefines(Opts, Builder); WindowsARMTargetInfo::getVisualStudioDefines(Opts, Builder); } MinGWARMTargetInfo::MinGWARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : WindowsARMTargetInfo(Triple, Opts) { TheCXXABI.set(TargetCXXABI::GenericARM); } void MinGWARMTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { WindowsARMTargetInfo::getTargetDefines(Opts, Builder); Builder.defineMacro("_ARM_"); } CygwinARMTargetInfo::CygwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : ARMleTargetInfo(Triple, Opts) { this->WCharType = TargetInfo::UnsignedShort; TLSSupported = false; DoubleAlign = LongLongAlign = 64; resetDataLayout("e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"); } void CygwinARMTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { ARMleTargetInfo::getTargetDefines(Opts, Builder); Builder.defineMacro("_ARM_"); Builder.defineMacro("__CYGWIN__"); Builder.defineMacro("__CYGWIN32__"); DefineStd(Builder, "unix", Opts); if (Opts.CPlusPlus) Builder.defineMacro("_GNU_SOURCE"); } DarwinARMTargetInfo::DarwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : DarwinTargetInfo<ARMleTargetInfo>(Triple, Opts) { HasAlignMac68kSupport = true; // iOS always has 64-bit atomic instructions. // FIXME: This should be based off of the target features in // ARMleTargetInfo. MaxAtomicInlineWidth = 64; if (Triple.isWatchABI()) { // Darwin on iOS uses a variant of the ARM C++ ABI.
TheCXXABI.set(TargetCXXABI::WatchOS); // BOOL should be a real boolean on the new ABI UseSignedCharForObjCBool = false; } else TheCXXABI.set(TargetCXXABI::iOS); } void DarwinARMTargetInfo::getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, MacroBuilder &Builder) const { getDarwinDefines(Builder, Opts, Triple, PlatformName, PlatformMinVersion); } RenderScript32TargetInfo::RenderScript32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : ARMleTargetInfo(llvm::Triple("armv7", Triple.getVendorName(), Triple.getOSName(), Triple.getEnvironmentName()), Opts) { IsRenderScriptTarget = true; LongWidth = LongAlign = 64; } void RenderScript32TargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__RENDERSCRIPT__"); ARMleTargetInfo::getTargetDefines(Opts, Builder); } diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 4da29ee600f6..4792af097d95 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1,786 +1,781 @@ //===- IntrinsicsARM.td - Defines ARM intrinsics -----------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines all of the ARM-specific intrinsics. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // TLS let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". // A space-consuming intrinsic primarily for testing ARMConstantIslands. The // first argument is the number of bytes this "instruction" takes up, the second // and return value are essentially chains, used to force ordering during ISel. 
def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>]>; // 16-bit multiplications def int_arm_smulbb : GCCBuiltin<"__builtin_arm_smulbb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulbt : GCCBuiltin<"__builtin_arm_smulbt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smultb : GCCBuiltin<"__builtin_arm_smultb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smultt : GCCBuiltin<"__builtin_arm_smultt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulwb : GCCBuiltin<"__builtin_arm_smulwb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smulwt : GCCBuiltin<"__builtin_arm_smulwt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Saturating Arithmetic def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative, IntrNoMem]>; def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Accumulating multiplications def int_arm_smlabb : GCCBuiltin<"__builtin_arm_smlabb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlabt : GCCBuiltin<"__builtin_arm_smlabt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlatb : GCCBuiltin<"__builtin_arm_smlatb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlatt : GCCBuiltin<"__builtin_arm_smlatt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlawb : GCCBuiltin<"__builtin_arm_smlawb">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlawt : GCCBuiltin<"__builtin_arm_smlawt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit saturation def int_arm_ssat16 : GCCBuiltin<"__builtin_arm_ssat16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usat16 : GCCBuiltin<"__builtin_arm_usat16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Packing and unpacking def int_arm_sxtab16 : GCCBuiltin<"__builtin_arm_sxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_sxtb16 : GCCBuiltin<"__builtin_arm_sxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_uxtab16 : GCCBuiltin<"__builtin_arm_uxtab16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uxtb16 : GCCBuiltin<"__builtin_arm_uxtb16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; // Parallel selection, reads the GE flags. 
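// Illustration (hypothetical user code; builtin names per the GCCBuiltin
// mappings in this file): the parallel add/subtract ops above set the GE
// flags per lane, and sel then consumes them:
//   uint32_t pick(uint32_t a, uint32_t b) {
//     (void)__builtin_arm_sadd8((int32_t)a, (int32_t)b); // sets GE per byte
//     return __builtin_arm_sel(a, b); // byte-wise select of a/b based on GE
//   }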
def int_arm_sel : GCCBuiltin<"__builtin_arm_sel">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; // Parallel 8-bit addition and subtraction def int_arm_qadd8 : GCCBuiltin<"__builtin_arm_qadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsub8 : GCCBuiltin<"__builtin_arm_qsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_sadd8 : GCCBuiltin<"__builtin_arm_sadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_shadd8 : GCCBuiltin<"__builtin_arm_shadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsub8 : GCCBuiltin<"__builtin_arm_shsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_ssub8 : GCCBuiltin<"__builtin_arm_ssub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uadd8 : GCCBuiltin<"__builtin_arm_uadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_uhadd8 : GCCBuiltin<"__builtin_arm_uhadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsub8 : GCCBuiltin<"__builtin_arm_uhsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqadd8 : GCCBuiltin<"__builtin_arm_uqadd8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsub8 : GCCBuiltin<"__builtin_arm_uqsub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_usub8 : GCCBuiltin<"__builtin_arm_usub8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Sum of 8-bit absolute differences def int_arm_usad8 : GCCBuiltin<"__builtin_arm_usad8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_usada8 : GCCBuiltin<"__builtin_arm_usada8">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Parallel 16-bit addition and subtraction def int_arm_qadd16 : GCCBuiltin<"__builtin_arm_qadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qasx : GCCBuiltin<"__builtin_arm_qasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsax : GCCBuiltin<"__builtin_arm_qsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_qsub16 : GCCBuiltin<"__builtin_arm_qsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_sadd16 : GCCBuiltin<"__builtin_arm_sadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_sasx : GCCBuiltin<"__builtin_arm_sasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_shadd16 : GCCBuiltin<"__builtin_arm_shadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shasx : GCCBuiltin<"__builtin_arm_shasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsax : GCCBuiltin<"__builtin_arm_shsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_shsub16 : GCCBuiltin<"__builtin_arm_shsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_ssax : GCCBuiltin<"__builtin_arm_ssax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. 
def int_arm_ssub16 : GCCBuiltin<"__builtin_arm_ssub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uadd16 : GCCBuiltin<"__builtin_arm_uadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_uasx : GCCBuiltin<"__builtin_arm_uasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; def int_arm_uhadd16 : GCCBuiltin<"__builtin_arm_uhadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhasx : GCCBuiltin<"__builtin_arm_uhasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsax : GCCBuiltin<"__builtin_arm_uhsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uhsub16 : GCCBuiltin<"__builtin_arm_uhsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqadd16 : GCCBuiltin<"__builtin_arm_uqadd16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqasx : GCCBuiltin<"__builtin_arm_uqasx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsax : GCCBuiltin<"__builtin_arm_uqsax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_uqsub16 : GCCBuiltin<"__builtin_arm_uqsub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; // Writes to the GE bits. def int_arm_usax : GCCBuiltin<"__builtin_arm_usax">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Writes to the GE bits. def int_arm_usub16 : GCCBuiltin<"__builtin_arm_usub16">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>; // Parallel 16-bit multiplication def int_arm_smlad : GCCBuiltin<"__builtin_arm_smlad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smladx : GCCBuiltin<"__builtin_arm_smladx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlald : GCCBuiltin<"__builtin_arm_smlald">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlaldx : GCCBuiltin<"__builtin_arm_smlaldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlsd : GCCBuiltin<"__builtin_arm_smlsd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlsdx : GCCBuiltin<"__builtin_arm_smlsdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smlsld : GCCBuiltin<"__builtin_arm_smlsld">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smlsldx : GCCBuiltin<"__builtin_arm_smlsldx">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; def int_arm_smuad : GCCBuiltin<"__builtin_arm_smuad">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smuadx : GCCBuiltin<"__builtin_arm_smuadx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smusd : GCCBuiltin<"__builtin_arm_smusd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_smusdx : GCCBuiltin<"__builtin_arm_smusdx">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Load, Store and Clear exclusive def int_arm_ldrex : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>; def int_arm_strex : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_anyptr_ty]>; def 
int_arm_ldaex : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>; def int_arm_stlex : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_anyptr_ty]>; def int_arm_clrex : Intrinsic<[]>; def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>; def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>; def int_arm_stlexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>; def int_arm_ldaexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>; //===----------------------------------------------------------------------===// // Data barrier instructions def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intrinsic<[], [llvm_i32_ty]>; def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>; def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // VFP def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">, Intrinsic<[llvm_i32_ty], [], []>; def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">, Intrinsic<[], [llvm_i32_ty], []>; def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], [IntrNoMem]>; def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // Coprocessor def int_arm_ldc : GCCBuiltin<"__builtin_arm_ldc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldcl : GCCBuiltin<"__builtin_arm_ldcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldc2 : GCCBuiltin<"__builtin_arm_ldc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_ldc2l : GCCBuiltin<"__builtin_arm_ldc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc : GCCBuiltin<"__builtin_arm_stc">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stcl : GCCBuiltin<"__builtin_arm_stcl">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc2 : GCCBuiltin<"__builtin_arm_stc2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; def int_arm_stc2l : GCCBuiltin<"__builtin_arm_stc2l">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>; // Move to coprocessor def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; // Move from coprocessor def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">, MSBuiltin<"_MoveFromCoprocessor">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>; def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">, MSBuiltin<"_MoveFromCoprocessor2">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>; // Coprocessor data processing def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">, Intrinsic<[], 
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>; // Move from two registers to coprocessor def int_arm_mcrr : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>; def int_arm_mcrr2 : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>; def int_arm_mrrc : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>; def int_arm_mrrc2 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>; //===----------------------------------------------------------------------===// // CRC32 def int_arm_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // CMSE def int_arm_cmse_tt : GCCBuiltin<"__builtin_arm_cmse_TT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_ttt : GCCBuiltin<"__builtin_arm_cmse_TTT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_tta : GCCBuiltin<"__builtin_arm_cmse_TTA">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_arm_cmse_ttat : GCCBuiltin<"__builtin_arm_cmse_TTAT">, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; //===----------------------------------------------------------------------===// // HINT def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>; def int_arm_dbg : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // UND (reserved undefined sequence) def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // Advanced SIMD (NEON) // The following classes do not correspond directly to GCC builtins. 
class Neon_1Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; class Neon_1Arg_Narrow_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>], [IntrNoMem]>; class Neon_2Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_2Arg_Narrow_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>, LLVMExtendedType<0>], [IntrNoMem]>; class Neon_2Arg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>, LLVMTruncatedType<0>], [IntrNoMem]>; class Neon_3Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_3Arg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMTruncatedType<0>, LLVMTruncatedType<0>], [IntrNoMem]>; class Neon_1FloatArg_Intrinsic : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; class Neon_CvtFxToFP_Intrinsic : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; class Neon_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; class Neon_CvtFPtoInt_1Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; class Neon_Compare_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; // The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors. // Besides the table, VTBL has one other v8i8 argument and VTBX has two. // Overall, the classes range from 2 to 6 v8i8 arguments. class Neon_Tbl2Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl3Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl4Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl5Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; class Neon_Tbl6Arg_Intrinsic : Intrinsic<[llvm_v8i8_ty], [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>; // Arithmetic ops let IntrProperties = [IntrNoMem, Commutative] in { // Vector Add. def int_arm_neon_vhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vqadds : Neon_2Arg_Intrinsic; def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic; // Vector Multiply. def int_arm_neon_vmulp : Neon_2Arg_Intrinsic; def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic; def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic; def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic; // Vector Maximum. def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxnm : Neon_2Arg_Intrinsic; // Vector Minimum. def int_arm_neon_vmins : Neon_2Arg_Intrinsic; def int_arm_neon_vminu : Neon_2Arg_Intrinsic; def int_arm_neon_vminnm : Neon_2Arg_Intrinsic; // Vector Reciprocal Step. def int_arm_neon_vrecps : Neon_2Arg_Intrinsic; // Vector Reciprocal Square Root Step. def int_arm_neon_vrsqrts : Neon_2Arg_Intrinsic; } // Vector Subtract. 
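// Mapping sketch (assumed lowering; the exact overload suffix depends on the
// vector type): arm_neon.h intrinsics compile down to these definitions, e.g.
//   int8x8_t f(int8x8_t a, int8x8_t b) {
//     return vqsub_s8(a, b); // expected to lower to llvm.arm.neon.vqsubs.v8i8
//   }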
def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic; // Vector Absolute Compare. def int_arm_neon_vacge : Neon_Compare_Intrinsic; def int_arm_neon_vacgt : Neon_Compare_Intrinsic; // Vector Absolute Differences. def int_arm_neon_vabds : Neon_2Arg_Intrinsic; def int_arm_neon_vabdu : Neon_2Arg_Intrinsic; // Vector Pairwise Add. def int_arm_neon_vpadd : Neon_2Arg_Intrinsic; // Vector Pairwise Add Long. // Note: This is different than the other "long" NEON intrinsics because // the result vector has half as many elements as the source vector. // The source and destination vector types must be specified separately. def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; // Vector Pairwise Add and Accumulate Long. // Note: This is similar to vpaddl but the destination vector also appears // as the first argument. def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; // Vector Pairwise Maximum and Minimum. def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic; def int_arm_neon_vpmins : Neon_2Arg_Intrinsic; def int_arm_neon_vpminu : Neon_2Arg_Intrinsic; // Vector Shifts: // // The various saturating and rounding vector shift operations need to be // represented by intrinsics in LLVM, and even the basic VSHL variable shift // operation cannot be safely translated to LLVM's shift operators. VSHL can // be used for both left and right shifts, or even combinations of the two, // depending on the signs of the shift amounts. It also has well-defined // behavior for shift amounts that LLVM leaves undefined. Only basic shifts // by constants can be represented with LLVM's shift operators. // // The shift counts for these intrinsics are always vectors, even for constant // shifts, where the constant is replicated. For consistency with VSHL (and // other variable shift instructions), left shifts have positive shift counts // and right shifts have negative shift counts. This convention is also used // for constant right shift intrinsics, and to help preserve sanity, the // intrinsic names use "shift" instead of either "shl" or "shr". Where // applicable, signed and unsigned versions of the intrinsics are // distinguished with "s" and "u" suffixes. A few NEON shift instructions, // such as VQSHLU, take signed operands but produce unsigned results; these // use a "su" suffix. // Vector Shift. def int_arm_neon_vshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic; // Vector Rounding Shift. def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic; // Vector Saturating Shift. def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic; def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic; // Vector Saturating Rounding Shift. 
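// Illustration of the negative-shift-count convention described above
// (assumed lowering): a C-level rounding narrowing right shift such as
//   int8x8_t g(int16x8_t a) { return vrshrn_n_s16(a, 3); }
// is expected to reach these intrinsics with the count vector splatted to -3.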
def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic; def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic; def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic; def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic; // Vector Shift and Insert. def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic; // Vector Absolute Value and Saturating Absolute Value. def int_arm_neon_vabs : Neon_1Arg_Intrinsic; def int_arm_neon_vqabs : Neon_1Arg_Intrinsic; // Vector Saturating Negate. def int_arm_neon_vqneg : Neon_1Arg_Intrinsic; // Vector Count Leading Sign/Zero Bits. def int_arm_neon_vcls : Neon_1Arg_Intrinsic; // Vector Reciprocal Estimate. def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic; // Vector Reciprocal Square Root Estimate. def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic; // Vector Conversions Between Floating-point and Integer def int_arm_neon_vcvtau : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtas : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtnu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtns : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtpu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtps : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtmu : Neon_CvtFPtoInt_1Arg_Intrinsic; def int_arm_neon_vcvtms : Neon_CvtFPtoInt_1Arg_Intrinsic; // Vector Conversions Between Floating-point and Fixed-point. def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic; def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic; def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic; def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic; // Vector Conversions Between Half-Precision and Single-Precision. def int_arm_neon_vcvtfp2hf : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_arm_neon_vcvthf2fp : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; // Narrowing Saturating Vector Moves. def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic; def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic; def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic; // Vector Table Lookup. // The first 1-4 arguments are the table. def int_arm_neon_vtbl1 : Neon_Tbl2Arg_Intrinsic; def int_arm_neon_vtbl2 : Neon_Tbl3Arg_Intrinsic; def int_arm_neon_vtbl3 : Neon_Tbl4Arg_Intrinsic; def int_arm_neon_vtbl4 : Neon_Tbl5Arg_Intrinsic; // Vector Table Extension. // Some elements of the destination vector may not be updated, so the original // value of that vector is passed as the first argument. The next 1-4 // arguments after that are the table. def int_arm_neon_vtbx1 : Neon_Tbl3Arg_Intrinsic; def int_arm_neon_vtbx2 : Neon_Tbl4Arg_Intrinsic; def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic; def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic; // Vector and Scalar Rounding. def int_arm_neon_vrintn : Neon_1FloatArg_Intrinsic; def int_arm_neon_vrintx : Neon_1Arg_Intrinsic; def int_arm_neon_vrinta : Neon_1Arg_Intrinsic; def int_arm_neon_vrintz : Neon_1Arg_Intrinsic; def int_arm_neon_vrintm : Neon_1Arg_Intrinsic; def int_arm_neon_vrintp : Neon_1Arg_Intrinsic; // De-interleaving vector loads from N-element structures. // Source operands are the address and alignment. 
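// Illustrative example (assumed type mangling, not part of the upstream file): a 2-way de-interleaving load of <8 x i8> pairs would appear in IR roughly as // %vld2 = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %p, i32 1) // where the trailing i32 gives the alignment in bytes.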
def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, IntrArgMemOnly]>; // Vector load N-element structure to one lane. // Source operands are: the address, the N input vectors (since only one // lane is assigned), the lane number, and the alignment. def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; // Vector load N-element structure to all lanes. // Source operands are the address and alignment. def int_arm_neon_vld2dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld3dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_arm_neon_vld4dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; // Interleaving vector stores from N-element structures. // Source operands are: the address, the N vectors, and the alignment.
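// Illustrative counterpart of the load example above (assumed type mangling, not part of the upstream file): // call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %p, <8 x i8> %a, <8 x i8> %b, i32 1)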
def int_arm_neon_vst1 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst2 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst3 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst4 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst1x2 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; def int_arm_neon_vst1x3 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; def int_arm_neon_vst1x4 : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>], [IntrArgMemOnly, NoCapture<0>]>; // Vector store N-element structure from one lane. // Source operands are: the address, the N vectors, the lane number, and // the alignment. def int_arm_neon_vst2lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst3lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_arm_neon_vst4lane : Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; // Vector bitwise select. def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; // Crypto instructions class AES_1Arg_Intrinsic : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; class AES_2Arg_Intrinsic : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; class SHA_1Arg_Intrinsic : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; class SHA_2Arg_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; class SHA_3Arg_i32_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; class SHA_3Arg_v4i32_Intrinsic : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,llvm_v4i32_ty], [IntrNoMem]>; def int_arm_neon_aesd : AES_2Arg_Intrinsic; def int_arm_neon_aese : AES_2Arg_Intrinsic; def int_arm_neon_aesimc : AES_1Arg_Intrinsic; def int_arm_neon_aesmc : AES_1Arg_Intrinsic; def int_arm_neon_sha1h : SHA_1Arg_Intrinsic; def int_arm_neon_sha1su1 : SHA_2Arg_Intrinsic; def int_arm_neon_sha256su0 : SHA_2Arg_Intrinsic; def int_arm_neon_sha1c : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1m : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1p : SHA_3Arg_i32_Intrinsic; def int_arm_neon_sha1su0: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256h: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic; // Armv8.2-A dot product instructions class Neon_Dot_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; -// GNU eabi mcount -def int_arm_gnu_eabi_mcount : Intrinsic<[], - [], - [IntrReadMem, IntrWriteMem]>; - } // end TargetPrefix diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 
73ebe0940f32..bd4ca3828fc0 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1,1989 +1,1958 @@ //===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains a pass that expands pseudo instructions into target // instructions to allow proper scheduling, if-conversion, and other late // optimizations. This pass should be run after register allocation but before // the post-regalloc scheduling pass. // //===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "arm-pseudo" static cl::opt<bool> VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden, cl::desc("Verify machine code after expanding ARM pseudos")); #define ARM_EXPAND_PSEUDO_NAME "ARM pseudo instruction expansion pass" namespace { class ARMExpandPseudo : public MachineFunctionPass { public: static char ID; ARMExpandPseudo() : MachineFunctionPass(ID) {} const ARMBaseInstrInfo *TII; const TargetRegisterInfo *TRI; const ARMSubtarget *STI; ARMFunctionInfo *AFI; bool runOnMachineFunction(MachineFunction &Fn) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); } StringRef getPassName() const override { return ARM_EXPAND_PSEUDO_NAME; } private: void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool ExpandMBB(MachineBasicBlock &MBB); void ExpandVLD(MachineBasicBlock::iterator &MBBI); void ExpandVST(MachineBasicBlock::iterator &MBBI); void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); void ExpandVTBL(MachineBasicBlock::iterator &MBBI, unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); bool ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, unsigned UxtOp, MachineBasicBlock::iterator &NextMBBI); bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); }; char ARMExpandPseudo::ID = 0; } INITIALIZE_PASS(ARMExpandPseudo, DEBUG_TYPE, ARM_EXPAND_PSEUDO_NAME, false, false) /// TransferImpOps - Transfer implicit operands on the pseudo instruction to /// the instructions created from the expansion.
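/// For instance (illustrative): an implicit use of a super-register attached to the pseudo after isel is re-attached to the expanded use instruction rather than dropped; the loop below routes MO.isUse() operands to UseMI and the remaining implicit defs to DefMI.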
void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI) { const MCInstrDesc &Desc = OldMI.getDesc(); for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = OldMI.getOperand(i); assert(MO.isReg() && MO.getReg()); if (MO.isUse()) UseMI.add(MO); else DefMI.add(MO); } } namespace { // Constants for register spacing in NEON load/store instructions. // For quad-register load-lane and store-lane pseudo instructions, the // spacing is initially assumed to be EvenDblSpc, and that is changed to // OddDblSpc depending on the lane number operand. enum NEONRegSpacing { SingleSpc, SingleLowSpc , // Single spacing, low registers, three and four vectors. SingleHighQSpc, // Single spacing, high registers, four vectors. SingleHighTSpc, // Single spacing, high registers, three vectors. EvenDblSpc, OddDblSpc }; // Entries for NEON load/store information table. The table is sorted by // PseudoOpc for fast binary-search lookups. struct NEONLdStTableEntry { uint16_t PseudoOpc; uint16_t RealOpc; bool IsLoad; bool isUpdating; bool hasWritebackOperand; uint8_t RegSpacing; // One of type NEONRegSpacing uint8_t NumRegs; // D registers loaded or stored uint8_t RegElts; // elements per D register; used for lane ops // FIXME: Temporary flag to denote whether the real instruction takes // a single register (like the encoding) or all of the registers in // the list (like the asm syntax and the isel DAG). When all definitions // are converted to take only the single encoded register, this will // go away. bool copyAllListRegs; // Comparison methods for binary search of the table. bool operator<(const NEONLdStTableEntry &TE) const { return PseudoOpc < TE.PseudoOpc; } friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { return TE.PseudoOpc < PseudoOpc; } friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, const NEONLdStTableEntry &TE) { return PseudoOpc < TE.PseudoOpc; } }; } static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true}, { ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true}, { ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, false, EvenDblSpc, 1, 2 ,true}, { ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true, EvenDblSpc, 1, 2 ,true}, { ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true}, { ARM::VLD1d16QPseudo, ARM::VLD1d16Q, true, false, false, SingleSpc, 4, 4 ,false}, { ARM::VLD1d16TPseudo, ARM::VLD1d16T, true, false, false, SingleSpc, 3, 4 ,false}, { ARM::VLD1d32QPseudo, ARM::VLD1d32Q, true, false, false, SingleSpc, 4, 2 ,false}, { ARM::VLD1d32TPseudo, ARM::VLD1d32T, true, false, false, SingleSpc, 3, 2 ,false}, { ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false}, { ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false}, { ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false}, { ARM::VLD1d8QPseudo, ARM::VLD1d8Q, true, false, false,
SingleSpc, 4, 8 ,false}, { ARM::VLD1d8TPseudo, ARM::VLD1d8T, true, false, false, SingleSpc, 3, 8 ,false}, { ARM::VLD1q16HighQPseudo, ARM::VLD1d16Q, true, false, false, SingleHighQSpc, 4, 4 ,false}, { ARM::VLD1q16HighTPseudo, ARM::VLD1d16T, true, false, false, SingleHighTSpc, 3, 4 ,false}, { ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleLowSpc, 4, 4 ,false}, { ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleLowSpc, 3, 4 ,false}, { ARM::VLD1q32HighQPseudo, ARM::VLD1d32Q, true, false, false, SingleHighQSpc, 4, 2 ,false}, { ARM::VLD1q32HighTPseudo, ARM::VLD1d32T, true, false, false, SingleHighTSpc, 3, 2 ,false}, { ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleLowSpc, 4, 2 ,false}, { ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1d32Twb_fixed, true, true, true, SingleLowSpc, 3, 2 ,false}, { ARM::VLD1q64HighQPseudo, ARM::VLD1d64Q, true, false, false, SingleHighQSpc, 4, 1 ,false}, { ARM::VLD1q64HighTPseudo, ARM::VLD1d64T, true, false, false, SingleHighTSpc, 3, 1 ,false}, { ARM::VLD1q64LowQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleLowSpc, 4, 1 ,false}, { ARM::VLD1q64LowTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleLowSpc, 3, 1 ,false}, { ARM::VLD1q8HighQPseudo, ARM::VLD1d8Q, true, false, false, SingleHighQSpc, 4, 8 ,false}, { ARM::VLD1q8HighTPseudo, ARM::VLD1d8T, true, false, false, SingleHighTSpc, 3, 8 ,false}, { ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false}, { ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false}, { ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false}, { ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false}, { ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false}, { ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false}, { ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false}, { ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false}, { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true}, { ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true, SingleSpc, 2, 2 ,true}, { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, false, SingleSpc, 2, 8 ,true}, { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, true, SingleSpc, 2, 8 ,true}, { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, false, EvenDblSpc, 2, 4 ,true}, { ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true, EvenDblSpc, 2, 4 ,true}, { ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, false, EvenDblSpc, 2, 2 ,true}, { ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true, EvenDblSpc, 2, 2 ,true}, { ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, false, SingleSpc, 4, 4 ,false}, { ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q16wb_fixed, true, true, false, SingleSpc, 4, 4 ,false}, { ARM::VLD2q16PseudoWB_register, ARM::VLD2q16wb_register, true, true, true, SingleSpc, 4, 4 ,false}, { ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, false, SingleSpc, 4, 2 ,false}, { ARM::VLD2q32PseudoWB_fixed, ARM::VLD2q32wb_fixed, true, true, false, SingleSpc, 4, 2 ,false}, { ARM::VLD2q32PseudoWB_register, 
ARM::VLD2q32wb_register, true, true, true, SingleSpc, 4, 2 ,false}, { ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, false, SingleSpc, 4, 8 ,false}, { ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q8wb_fixed, true, true, false, SingleSpc, 4, 8 ,false}, { ARM::VLD2q8PseudoWB_register, ARM::VLD2q8wb_register, true, true, true, SingleSpc, 4, 8 ,false}, { ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, false, SingleSpc, 3, 4,true}, { ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true, SingleSpc, 3, 4,true}, { ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, false, SingleSpc, 3, 2,true}, { ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true}, { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true}, { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true}, { ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true}, { ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true}, { ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true}, { ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true}, { ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true}, { ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true}, { ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true}, { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, { ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, false, SingleSpc, 3, 2 ,true}, { ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true, SingleSpc, 3, 2 ,true}, { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, false, SingleSpc, 3, 8 ,true}, { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, true, SingleSpc, 3, 8 ,true}, { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, false, EvenDblSpc, 3, 4 ,true}, { ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true}, { ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, false, EvenDblSpc, 3, 2 ,true}, { ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true}, { ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, false, SingleSpc, 3, 4 ,true}, { ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, { ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, false, SingleSpc, 3, 2 ,true}, { ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, true, SingleSpc, 3, 2 ,true}, { ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, false, SingleSpc, 3, 8 ,true}, { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, true, SingleSpc, 3, 8 ,true}, { ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true}, { ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, false, OddDblSpc, 3, 4 ,true}, { ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true, OddDblSpc, 3, 4 ,true}, { ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true}, { ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, false, OddDblSpc, 3, 2 ,true}, { ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true, OddDblSpc, 3, 2 ,true}, { ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, true, EvenDblSpc, 3, 8 ,true}, { ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, false, OddDblSpc, 3, 8 ,true}, { ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, true, OddDblSpc, 3, 8 ,true}, { 
ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, false, SingleSpc, 4, 4,true}, { ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true, SingleSpc, 4, 4,true}, { ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, false, SingleSpc, 4, 2,true}, { ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true}, { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true}, { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true}, { ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true}, { ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true}, { ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true}, { ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true}, { ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true}, { ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true}, { ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true}, { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, { ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, false, SingleSpc, 4, 2 ,true}, { ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true, SingleSpc, 4, 2 ,true}, { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, false, SingleSpc, 4, 8 ,true}, { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, true, SingleSpc, 4, 8 ,true}, { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, false, EvenDblSpc, 4, 4 ,true}, { ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true}, { ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, false, EvenDblSpc, 4, 2 ,true}, { ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true}, { ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, false, SingleSpc, 4, 4 ,true}, { ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, { ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, false, SingleSpc, 4, 2 ,true}, { ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, true, SingleSpc, 4, 2 ,true}, { ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, false, SingleSpc, 4, 8 ,true}, { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, true, SingleSpc, 4, 8 ,true}, { ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true}, { ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, false, OddDblSpc, 4, 4 ,true}, { ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true, OddDblSpc, 4, 4 ,true}, { ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true}, { ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, false, OddDblSpc, 4, 2 ,true}, { ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true, OddDblSpc, 4, 2 ,true}, { ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, true, EvenDblSpc, 4, 8 ,true}, { ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, false, OddDblSpc, 4, 8 ,true}, { ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, true, OddDblSpc, 4, 8 ,true}, { ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, false, EvenDblSpc, 1, 4 ,true}, { ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true, EvenDblSpc, 1, 4 ,true}, { ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, false, EvenDblSpc, 1, 2 ,true}, { ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true, EvenDblSpc, 1, 2 ,true}, { 
ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true}, { ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false}, { ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false}, { ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false}, { ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false}, { ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false}, { ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false}, { ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false}, { ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false}, { ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false}, { ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false}, { ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false}, { ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false}, { ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false}, { ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false}, { ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false}, { ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false}, { ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false}, { ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false}, { ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false}, { ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false}, { ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false}, { ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false}, { ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false}, { ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false}, { ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true}, { ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true}, { ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, false, SingleSpc, 2, 2 ,true}, { ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true, SingleSpc, 2, 2 ,true}, { ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, false, SingleSpc, 2, 8 ,true}, { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, true, SingleSpc, 2, 8 ,true}, { ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, false, EvenDblSpc, 2, 4,true}, { ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true, EvenDblSpc, 2, 4,true}, { ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, false, EvenDblSpc, 
2, 2,true}, { ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true, EvenDblSpc, 2, 2,true}, { ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,false}, { ARM::VST2q16PseudoWB_fixed, ARM::VST2q16wb_fixed, false, true, false, SingleSpc, 4, 4 ,false}, { ARM::VST2q16PseudoWB_register, ARM::VST2q16wb_register, false, true, true, SingleSpc, 4, 4 ,false}, { ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,false}, { ARM::VST2q32PseudoWB_fixed, ARM::VST2q32wb_fixed, false, true, false, SingleSpc, 4, 2 ,false}, { ARM::VST2q32PseudoWB_register, ARM::VST2q32wb_register, false, true, true, SingleSpc, 4, 2 ,false}, { ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,false}, { ARM::VST2q8PseudoWB_fixed, ARM::VST2q8wb_fixed, false, true, false, SingleSpc, 4, 8 ,false}, { ARM::VST2q8PseudoWB_register, ARM::VST2q8wb_register, false, true, true, SingleSpc, 4, 8 ,false}, { ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, false, SingleSpc, 3, 4 ,true}, { ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true, SingleSpc, 3, 4 ,true}, { ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, false, SingleSpc, 3, 2 ,true}, { ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true, SingleSpc, 3, 2 ,true}, { ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, false, SingleSpc, 3, 8 ,true}, { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, true, SingleSpc, 3, 8 ,true}, { ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, false, EvenDblSpc, 3, 4,true}, { ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true, EvenDblSpc, 3, 4,true}, { ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, false, EvenDblSpc, 3, 2,true}, { ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true, EvenDblSpc, 3, 2,true}, { ARM::VST3d16Pseudo, ARM::VST3d16, false, false, false, SingleSpc, 3, 4 ,true}, { ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, true, SingleSpc, 3, 4 ,true}, { ARM::VST3d32Pseudo, ARM::VST3d32, false, false, false, SingleSpc, 3, 2 ,true}, { ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, true, SingleSpc, 3, 2 ,true}, { ARM::VST3d8Pseudo, ARM::VST3d8, false, false, false, SingleSpc, 3, 8 ,true}, { ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, true, SingleSpc, 3, 8 ,true}, { ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, true, EvenDblSpc, 3, 4 ,true}, { ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, false, OddDblSpc, 3, 4 ,true}, { ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true, OddDblSpc, 3, 4 ,true}, { ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, true, EvenDblSpc, 3, 2 ,true}, { ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, false, OddDblSpc, 3, 2 ,true}, { ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true, OddDblSpc, 3, 2 ,true}, { ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, true, EvenDblSpc, 3, 8 ,true}, { ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, false, OddDblSpc, 3, 8 ,true}, { ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, true, OddDblSpc, 3, 8 ,true}, { ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, false, SingleSpc, 4, 4 ,true}, { ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true, SingleSpc, 4, 4 ,true}, { ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, false, SingleSpc, 4, 2 ,true}, { ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true, SingleSpc, 4, 2 ,true}, { ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, false, SingleSpc, 4, 8 ,true}, { 
ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, true, SingleSpc, 4, 8 ,true}, { ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, false, EvenDblSpc, 4, 4,true}, { ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true, EvenDblSpc, 4, 4,true}, { ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, false, EvenDblSpc, 4, 2,true}, { ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true, EvenDblSpc, 4, 2,true}, { ARM::VST4d16Pseudo, ARM::VST4d16, false, false, false, SingleSpc, 4, 4 ,true}, { ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, true, SingleSpc, 4, 4 ,true}, { ARM::VST4d32Pseudo, ARM::VST4d32, false, false, false, SingleSpc, 4, 2 ,true}, { ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, true, SingleSpc, 4, 2 ,true}, { ARM::VST4d8Pseudo, ARM::VST4d8, false, false, false, SingleSpc, 4, 8 ,true}, { ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, true, SingleSpc, 4, 8 ,true}, { ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, true, EvenDblSpc, 4, 4 ,true}, { ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, false, OddDblSpc, 4, 4 ,true}, { ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true, OddDblSpc, 4, 4 ,true}, { ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, true, EvenDblSpc, 4, 2 ,true}, { ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, false, OddDblSpc, 4, 2 ,true}, { ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true, OddDblSpc, 4, 2 ,true}, { ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, true, EvenDblSpc, 4, 8 ,true}, { ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, false, OddDblSpc, 4, 8 ,true}, { ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, true, OddDblSpc, 4, 8 ,true} }; /// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON /// load or store pseudo instruction. static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { #ifndef NDEBUG // Make sure the table is sorted. static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && "NEONLdStTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif auto I = llvm::lower_bound(NEONLdStTable, Opcode); if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode) return I; return nullptr; } /// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, /// corresponding to the specified register spacing. Not all of the results /// are necessarily valid, e.g., a Q register only has 2 D subregisters.
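/// Illustrative example: with EvenDblSpc the four results are dsub_0, dsub_2, /// dsub_4 and dsub_6 of Reg, while OddDblSpc yields dsub_1, dsub_3, dsub_5 and /// dsub_7; a caller expanding a two-register operation simply ignores D2/D3.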
static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, const TargetRegisterInfo *TRI, unsigned &D0, unsigned &D1, unsigned &D2, unsigned &D3) { if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_1); D2 = TRI->getSubReg(Reg, ARM::dsub_2); D3 = TRI->getSubReg(Reg, ARM::dsub_3); } else if (RegSpc == SingleHighQSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_4); D1 = TRI->getSubReg(Reg, ARM::dsub_5); D2 = TRI->getSubReg(Reg, ARM::dsub_6); D3 = TRI->getSubReg(Reg, ARM::dsub_7); } else if (RegSpc == SingleHighTSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_3); D1 = TRI->getSubReg(Reg, ARM::dsub_4); D2 = TRI->getSubReg(Reg, ARM::dsub_5); D3 = TRI->getSubReg(Reg, ARM::dsub_6); } else if (RegSpc == EvenDblSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_2); D2 = TRI->getSubReg(Reg, ARM::dsub_4); D3 = TRI->getSubReg(Reg, ARM::dsub_6); } else { assert(RegSpc == OddDblSpc && "unknown register spacing"); D0 = TRI->getSubReg(Reg, ARM::dsub_1); D1 = TRI->getSubReg(Reg, ARM::dsub_3); D2 = TRI->getSubReg(Reg, ARM::dsub_5); D3 = TRI->getSubReg(Reg, ARM::dsub_7); } } /// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register /// operands to real VLD instructions with D register operands. void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); Register DstReg = MI.getOperand(OpIdx++).getReg(); if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 || TableEntry->RealOpc == ARM::VLD2DUPd16x2 || TableEntry->RealOpc == ARM::VLD2DUPd32x2) { unsigned SubRegIndex; if (RegSpc == EvenDblSpc) { SubRegIndex = ARM::dsub_0; } else { assert(RegSpc == OddDblSpc && "Unexpected spacing!"); SubRegIndex = ARM::dsub_1; } Register SubReg = TRI->getSubReg(DstReg, SubRegIndex); unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, &ARM::DPairSpcRegClass); MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); } else { unsigned D0, D1, D2, D3; GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 1 && TableEntry->copyAllListRegs) MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 2 && TableEntry->copyAllListRegs) MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 3 && TableEntry->copyAllListRegs) MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); } if (TableEntry->isUpdating) MIB.add(MI.getOperand(OpIdx++)); // Copy the addrmode6 operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Copy the am6offset operand. if (TableEntry->hasWritebackOperand) { // TODO: The writing-back pseudo instructions we translate here are all // defined to take am6offset nodes that are capable to represent both fixed // and register forms. Some real instructions, however, do not rely on // am6offset and have separate definitions for such forms. 
When this is the // case, fixed forms do not take any offset nodes, so here we skip them for // such instructions. Once all real and pseudo writing-back instructions are // rewritten without use of am6offset nodes, this code will go away. const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed || TableEntry->RealOpc == ARM::VLD1d16Qwb_fixed || TableEntry->RealOpc == ARM::VLD1d32Qwb_fixed || TableEntry->RealOpc == ARM::VLD1d64Qwb_fixed || TableEntry->RealOpc == ARM::VLD1d8Twb_fixed || TableEntry->RealOpc == ARM::VLD1d16Twb_fixed || TableEntry->RealOpc == ARM::VLD1d32Twb_fixed || TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) { assert(AM6Offset.getReg() == 0 && "A fixed writing-back pseudo instruction provides an offset " "register!"); } else { MIB.add(AM6Offset); } } // For an instruction writing double-spaced subregs, the pseudo instruction // has an extra operand that is a use of the super-register. Record the // operand index and skip over it. unsigned SrcOpIdx = 0; if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 && TableEntry->RealOpc != ARM::VLD2DUPd16x2 && TableEntry->RealOpc != ARM::VLD2DUPd32x2) { if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc || RegSpc == SingleHighTSpc) SrcOpIdx = OpIdx++; } // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Copy the super-register source operand used for double-spaced subregs over // to the new instruction as an implicit operand. if (SrcOpIdx != 0) { MachineOperand MO = MI.getOperand(SrcOpIdx); MO.setImplicit(true); MIB.add(MO); } // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); // Transfer memoperands. MIB.cloneMemRefs(MI); MI.eraseFromParent(); LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump();); } /// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register /// operands to real VST instructions with D register operands. void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; if (TableEntry->isUpdating) MIB.add(MI.getOperand(OpIdx++)); // Copy the addrmode6 operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); if (TableEntry->hasWritebackOperand) { // TODO: The writing-back pseudo instructions we translate here are all // defined to take am6offset nodes that are capable to represent both fixed // and register forms. Some real instructions, however, do not rely on // am6offset and have separate definitions for such forms. When this is the // case, fixed forms do not take any offset nodes, so here we skip them for // such instructions. Once all real and pseudo writing-back instructions are // rewritten without use of am6offset nodes, this code will go away. 
const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed || TableEntry->RealOpc == ARM::VST1d16Qwb_fixed || TableEntry->RealOpc == ARM::VST1d32Qwb_fixed || TableEntry->RealOpc == ARM::VST1d64Qwb_fixed || TableEntry->RealOpc == ARM::VST1d8Twb_fixed || TableEntry->RealOpc == ARM::VST1d16Twb_fixed || TableEntry->RealOpc == ARM::VST1d32Twb_fixed || TableEntry->RealOpc == ARM::VST1d64Twb_fixed) { assert(AM6Offset.getReg() == 0 && "A fixed writing-back pseudo instruction provides an offset " "register!"); } else { MIB.add(AM6Offset); } } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, getUndefRegState(SrcIsUndef)); if (NumRegs > 1 && TableEntry->copyAllListRegs) MIB.addReg(D1, getUndefRegState(SrcIsUndef)); if (NumRegs > 2 && TableEntry->copyAllListRegs) MIB.addReg(D2, getUndefRegState(SrcIsUndef)); if (NumRegs > 3 && TableEntry->copyAllListRegs) MIB.addReg(D3, getUndefRegState(SrcIsUndef)); // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. MIB->addRegisterKilled(SrcReg, TRI, true); else if (!SrcIsUndef) MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg. TransferImpOps(MI, MIB, MIB); // Transfer memoperands. MIB.cloneMemRefs(MI); MI.eraseFromParent(); LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump();); } /// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ /// register operands to real instructions with D register operands. void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && "NEONLdStTable lookup failed"); NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; unsigned RegElts = TableEntry->RegElts; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; // The lane operand is always the 3rd from last operand, before the 2 // predicate operands. unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm(); // Adjust the lane and spacing as needed for Q registers. assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane"); if (RegSpc == EvenDblSpc && Lane >= RegElts) { RegSpc = OddDblSpc; Lane -= RegElts; } assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0; unsigned DstReg = 0; bool DstIsDead = false; if (TableEntry->IsLoad) { DstIsDead = MI.getOperand(OpIdx).isDead(); DstReg = MI.getOperand(OpIdx++).getReg(); GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 1) MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 2) MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 3) MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); } if (TableEntry->isUpdating) MIB.add(MI.getOperand(OpIdx++)); // Copy the addrmode6 operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Copy the am6offset operand. 
if (TableEntry->hasWritebackOperand) MIB.add(MI.getOperand(OpIdx++)); // Grab the super-register source. MachineOperand MO = MI.getOperand(OpIdx++); if (!TableEntry->IsLoad) GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3); // Add the subregs as sources of the new instruction. unsigned SrcFlags = (getUndefRegState(MO.isUndef()) | getKillRegState(MO.isKill())); MIB.addReg(D0, SrcFlags); if (NumRegs > 1) MIB.addReg(D1, SrcFlags); if (NumRegs > 2) MIB.addReg(D2, SrcFlags); if (NumRegs > 3) MIB.addReg(D3, SrcFlags); // Add the lane number operand. MIB.addImm(Lane); OpIdx += 1; // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Copy the super-register source to be an implicit source. MO.setImplicit(true); MIB.add(MO); if (TableEntry->IsLoad) // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); // Transfer memoperands. MIB.cloneMemRefs(MI); MI.eraseFromParent(); } /// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ /// register operands to real instructions with D register operands. void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, unsigned Opc, bool IsExt) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); unsigned OpIdx = 0; // Transfer the destination register operand. MIB.add(MI.getOperand(OpIdx++)); if (IsExt) { MachineOperand VdSrc(MI.getOperand(OpIdx++)); MIB.add(VdSrc); } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0); // Copy the other source register operand. MachineOperand VmSrc(MI.getOperand(OpIdx++)); MIB.add(VmSrc); // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Add an implicit kill and use for the super-reg. MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill)); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump();); } static bool IsAnAddressOperand(const MachineOperand &MO) { // This check is overly conservative. Unless we are certain that the machine // operand is not a symbol reference, we return that it is a symbol reference. // This is important as the load pair may not be split up on Windows.
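// For example (illustrative), an MO_GlobalAddress is always treated as an // address here even when it might fold to a plain constant, since on Windows // the resulting movw/movt pair must stay bundled (see RequiresBundling in // ExpandMOV32BitImm below).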
switch (MO.getType()) { case MachineOperand::MO_Register: case MachineOperand::MO_Immediate: case MachineOperand::MO_CImmediate: case MachineOperand::MO_FPImmediate: case MachineOperand::MO_ShuffleMask: return false; case MachineOperand::MO_MachineBasicBlock: return true; case MachineOperand::MO_FrameIndex: return false; case MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_TargetIndex: case MachineOperand::MO_JumpTableIndex: case MachineOperand::MO_ExternalSymbol: case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_BlockAddress: return true; case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: return false; case MachineOperand::MO_Metadata: case MachineOperand::MO_MCSymbol: return true; case MachineOperand::MO_CFIIndex: return false; case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: llvm_unreachable("should not exist post-isel"); } llvm_unreachable("unhandled machine operand type"); } static MachineOperand makeImplicit(const MachineOperand &MO) { MachineOperand NewMO = MO; NewMO.setImplicit(); return NewMO; } void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO); MachineInstrBuilder LO16, HI16; LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); if (!STI->hasV6T2Ops() && (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) { // FIXME Windows CE supports older ARM CPUs assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+"); // Expand into a movi + orr. 
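// Illustrative split (not from the upstream file): 0x00AB00CD is not a valid // ARM modified immediate, but it decomposes into 0x000000CD and 0x00AB0000, // both encodable, giving roughly: mov rD, #0xCD ; orr rD, rD, #0xAB0000 // (operand order per getSOImmTwoPartFirst/Second below).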
LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg); assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); unsigned ImmVal = (unsigned)MO.getImm(); unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); LO16.cloneMemRefs(MI); HI16.cloneMemRefs(MI); LO16.addImm(Pred).addReg(PredReg).add(condCodeOp()); HI16.addImm(Pred).addReg(PredReg).add(condCodeOp()); if (isCC) LO16.add(makeImplicit(MI.getOperand(1))); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); return; } unsigned LO16Opc = 0; unsigned HI16Opc = 0; if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { LO16Opc = ARM::t2MOVi16; HI16Opc = ARM::t2MOVTi16; } else { LO16Opc = ARM::MOVi16; HI16Opc = ARM::MOVTi16; } LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg); switch (MO.getType()) { case MachineOperand::MO_Immediate: { unsigned Imm = MO.getImm(); unsigned Lo16 = Imm & 0xffff; unsigned Hi16 = (Imm >> 16) & 0xffff; LO16 = LO16.addImm(Lo16); HI16 = HI16.addImm(Hi16); break; } case MachineOperand::MO_ExternalSymbol: { const char *ES = MO.getSymbolName(); unsigned TF = MO.getTargetFlags(); LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16); HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16); break; } default: { const GlobalValue *GV = MO.getGlobal(); unsigned TF = MO.getTargetFlags(); LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); break; } } LO16.cloneMemRefs(MI); HI16.cloneMemRefs(MI); LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); if (RequiresBundling) finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); if (isCC) LO16.add(makeImplicit(MI.getOperand(1))); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); LLVM_DEBUG(dbgs() << "To: "; LO16.getInstr()->dump();); LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump();); } /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as /// possible. This only gets used at -O0 so we don't care about efficiency of /// the generated code. bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, unsigned UxtOp, MachineBasicBlock::iterator &NextMBBI) { bool IsThumb = STI->isThumb(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. 
assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); Register AddrReg = MI.getOperand(2).getReg(); Register DesiredReg = MI.getOperand(3).getReg(); Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoadCmpBB); MF->insert(++LoadCmpBB->getIterator(), StoreBB); MF->insert(++StoreBB->getIterator(), DoneBB); if (UxtOp) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg) .addReg(DesiredReg, RegState::Kill); if (!IsThumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); } // .Lloadcmp: // ldrex rDest, [rAddr] // cmp rDest, rDesired // bne .Ldone MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg()); MIB.addReg(AddrReg); if (LdrexOp == ARM::t2LDREX) MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) .addReg(DesiredReg) .add(predOps(ARMCC::AL)); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; BuildMI(LoadCmpBB, DL, TII->get(Bcc)) .addMBB(DoneBB) .addImm(ARMCC::NE) .addReg(ARM::CPSR, RegState::Kill); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); // .Lstore: // strex rTempReg, rNew, [rAddr] // cmp rTempReg, #0 // bne .Lloadcmp MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), TempReg) .addReg(NewReg) .addReg(AddrReg); if (StrexOp == ARM::t2STREX) MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) .addReg(TempReg, RegState::Kill) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) .addMBB(LoadCmpBB) .addImm(ARMCC::NE) .addReg(ARM::CPSR, RegState::Kill); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); // Recompute livein lists. LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *DoneBB); computeAndAddLiveIns(LiveRegs, *StoreBB); computeAndAddLiveIns(LiveRegs, *LoadCmpBB); // Do an extra pass around the loop to get loop carried registers right. StoreBB->clearLiveIns(); computeAndAddLiveIns(LiveRegs, *StoreBB); LoadCmpBB->clearLiveIns(); computeAndAddLiveIns(LiveRegs, *LoadCmpBB); return true; } /// ARM's ldrexd/strexd take a consecutive register pair (represented as a /// single GPRPair register), Thumb's take two separate registers so we need to /// extract the subregs from the pair. static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, unsigned Flags, bool IsThumb, const TargetRegisterInfo *TRI) { if (IsThumb) { Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); MIB.addReg(RegLo, Flags); MIB.addReg(RegHi, Flags); } else MIB.addReg(Reg.getReg(), Flags); } /// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop. 
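/// Illustrative shape of the result in ARM mode: ldrexd/strexd keep the /// GPRPair operand intact (e.g. ldrexd r4, r5, [r0]), whereas the Thumb2 /// encodings take the two halves as separate operands, which is why /// addExclusiveRegPair above splits out gsub_0/gsub_1 when IsThumb is set.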
bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { bool IsThumb = STI->isThumb(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; however, undef should be replaced by zero anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); Register AddrReg = MI.getOperand(2).getReg(); Register DesiredReg = MI.getOperand(3).getReg(); MachineOperand New = MI.getOperand(4); New.setIsKill(false); Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoadCmpBB); MF->insert(++LoadCmpBB->getIterator(), StoreBB); MF->insert(++StoreBB->getIterator(), DoneBB); // .Lloadcmp: // ldrexd rDestLo, rDestHi, [rAddr] // cmp rDestLo, rDesiredLo // cmpeq rDestHi, rDesiredHi // bne .Ldone unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD; MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD)); addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI); MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestLo, getKillRegState(Dest.isDead())) .addReg(DesiredLo) .add(predOps(ARMCC::AL)); BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestHi, getKillRegState(Dest.isDead())) .addReg(DesiredHi) .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; BuildMI(LoadCmpBB, DL, TII->get(Bcc)) .addMBB(DoneBB) .addImm(ARMCC::NE) .addReg(ARM::CPSR, RegState::Kill); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); // .Lstore: // strexd rTempReg, rNewLo, rNewHi, [rAddr] // cmp rTempReg, #0 // bne .Lloadcmp unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD; MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg); unsigned Flags = getKillRegState(New.isDead()); addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI); MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) .addReg(TempReg, RegState::Kill) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) .addMBB(LoadCmpBB) .addImm(ARMCC::NE) .addReg(ARM::CPSR, RegState::Kill); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); // Recompute livein lists. LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *DoneBB); computeAndAddLiveIns(LiveRegs, *StoreBB); computeAndAddLiveIns(LiveRegs, *LoadCmpBB); // Do an extra pass around the loop to get loop carried registers right.
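// (A single pass is not enough here: the live-ins of the two loop blocks
// depend on each other through the back-edge, so registers that are live
// around the loop only show up once the first computation has run.)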
StoreBB->clearLiveIns(); computeAndAddLiveIns(LiveRegs, *StoreBB); LoadCmpBB->clearLiveIns(); computeAndAddLiveIns(LiveRegs, *LoadCmpBB); return true; } bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { default: return false; case ARM::TCRETURNdi: case ARM::TCRETURNri: { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( MBB.getParent()->getSubtarget().getInstrInfo()); // Tail call return: adjust the stack pointer and jump to callee. MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); // Jump to label or value in register. if (RetOpcode == ARM::TCRETURNdi) { unsigned TCOpcode = STI->isThumb() ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : ARM::TAILJMPd; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); else { assert(JumpTarget.isSymbol()); MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } // Add the default predicate in Thumb mode. if (STI->isThumb()) MIB.add(predOps(ARMCC::AL)); } else if (RetOpcode == ARM::TCRETURNri) { unsigned Opcode = STI->isThumb() ? ARM::tTAILJMPr : (STI->hasV4TOps() ? ARM::TAILJMPr : ARM::TAILJMPr4); BuildMI(MBB, MBBI, dl, TII.get(Opcode)) .addReg(JumpTarget.getReg(), RegState::Kill); } auto NewMI = std::prev(MBBI); for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); MBBI = NewMI; return true; } case ARM::VMOVScc: case ARM::VMOVDcc: { unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), MI.getOperand(1).getReg()) .add(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) // 'pred' .add(MI.getOperand(4)) .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::t2MOVCCr: case ARM::MOVCCr: { unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(1).getReg()) .add(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) // 'pred' .add(MI.getOperand(4)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::MOVCCsi: { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), (MI.getOperand(1).getReg())) .add(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) .addImm(MI.getOperand(4).getImm()) // 'pred' .add(MI.getOperand(5)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::MOVCCsr: { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr), (MI.getOperand(1).getReg())) .add(MI.getOperand(2)) .add(MI.getOperand(3)) .addImm(MI.getOperand(4).getImm()) .addImm(MI.getOperand(5).getImm()) // 'pred' .add(MI.getOperand(6)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::t2MOVCCi16: case ARM::MOVCCi16: { unsigned NewOpc = AFI->isThumbFunction() ?
ARM::t2MOVi16 : ARM::MOVi16; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc), MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' .add(MI.getOperand(4)) .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::t2MOVCCi: case ARM::MOVCCi: { unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' .add(MI.getOperand(4)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::t2MVNCCi: case ARM::MVNCCi: { unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' .add(MI.getOperand(4)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::t2MOVCClsl: case ARM::t2MOVCClsr: case ARM::t2MOVCCasr: case ARM::t2MOVCCror: { unsigned NewOpc; switch (Opcode) { case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break; case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break; case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break; case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break; default: llvm_unreachable("unexpected conditional move"); } BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc), MI.getOperand(1).getReg()) .add(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) .addImm(MI.getOperand(4).getImm()) // 'pred' .add(MI.getOperand(5)) .add(condCodeOp()) // 's' bit .add(makeImplicit(MI.getOperand(1))); MI.eraseFromParent(); return true; } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo *>(TII); const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); // For functions using a base pointer, we rematerialize it (via the frame // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it // for us. Otherwise, expand to nothing. if (RI.hasBasePointer(MF)) { int32_t NumBytes = AFI->getFramePtrSpillOffset(); Register FramePtr = RI.getFrameRegister(MF); assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) && "base pointer without frame pointer?"); if (AFI->isThumb2Function()) { emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, *TII); } else if (AFI->isThumbFunction()) { emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, *TII, RI); } else { emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, *TII); } // If there's dynamic realignment, adjust for it. if (RI.needsStackRealignment(MF)) { MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned MaxAlign = MFI.getMaxAlignment(); assert (!AFI->isThumb1OnlyFunction()); // Emit bic r6, r6, MaxAlign assert(MaxAlign <= 256 && "The BIC instruction cannot encode " "immediates larger than 256 with all lower " "bits set."); unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6) .addReg(ARM::R6, RegState::Kill) .addImm(MaxAlign - 1) .add(predOps(ARMCC::AL)) .add(condCodeOp()); } } MI.eraseFromParent(); return true; } case ARM::MOVsrl_flag: case ARM::MOVsra_flag: { // These are just fancy MOVs instructions.
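// They expand to a flag-setting shift by one; e.g. MOVsrl_flag becomes,
// roughly, "movs Rd, Rm, lsr #1", leaving the shifted-out bit in the carry
// flag (note the explicit CPSR def below).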
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), MI.getOperand(0).getReg()) .add(MI.getOperand(1)) .addImm(ARM_AM::getSORegOpc( (Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr : ARM_AM::asr), 1)) .add(predOps(ARMCC::AL)) .addReg(ARM::CPSR, RegState::Define); MI.eraseFromParent(); return true; } case ARM::RRX: { // This encodes as "MOVs Rd, Rm, rrx". MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), MI.getOperand(0).getReg()) .add(MI.getOperand(1)) .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; } case ARM::tTPsoft: case ARM::TPsoft: { const bool Thumb = Opcode == ARM::tTPsoft; MachineInstrBuilder MIB; if (STI->genLongCalls()) { MachineFunction *MF = MBB.getParent(); MachineConstantPool *MCP = MF->getConstantPool(); unsigned PCLabelID = AFI->createPICLabelUId(); MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); Register Reg = MI.getOperand(0).getReg(); MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); if (!Thumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tBLXr : ARM::BLX)); if (Thumb) MIB.add(predOps(ARMCC::AL)); MIB.addReg(Reg, RegState::Kill); } else { MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tBL : ARM::BL)); if (Thumb) MIB.add(predOps(ARMCC::AL)); MIB.addExternalSymbol("__aeabi_read_tp", 0); } MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; } case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) ? ARM::tLDRpci : ARM::t2LDRpci; Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) .add(MI.getOperand(1)) .add(predOps(ARMCC::AL)); MIB1.cloneMemRefs(MI); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .add(MI.getOperand(2)); TransferImpOps(MI, MIB1, MIB2); MI.eraseFromParent(); return true; } case ARM::LDRLIT_ga_abs: case ARM::LDRLIT_ga_pcrel: case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: case ARM::tLDRLIT_ga_pcrel: { Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); const GlobalValue *GV = MO1.getGlobal(); bool IsARM = Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs; bool IsPIC = Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs; unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci; unsigned PICAddOpc = IsARM ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) : ARM::tPICADD; // We need a new const-pool entry to load from. MachineConstantPool *MCP = MBB.getParent()->getConstantPool(); unsigned ARMPCLabelIndex = 0; MachineConstantPoolValue *CPV; if (IsPIC) { unsigned PCAdj = IsARM ? 8 : 4; auto Modifier = (Flags & ARMII::MO_GOT) ?
ARMCP::GOT_PREL : ARMCP::no_modifier; ARMPCLabelIndex = AFI->createPICLabelUId(); CPV = ARMConstantPoolConstant::Create( GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, Modifier, /*AddCurrentAddr*/ Modifier == ARMCP::GOT_PREL); } else CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg) .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); if (IsARM) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); if (IsPIC) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .addImm(ARMPCLabelIndex); if (IsARM) MIB.add(predOps(ARMCC::AL)); } MI.eraseFromParent(); return true; } case ARM::MOV_ga_pcrel: case ARM::MOV_ga_pcrel_ldr: case ARM::t2MOV_ga_pcrel: { // Expand into movw + movt. Also "add pc" / ldr [pc] in PIC mode. unsigned LabelId = AFI->createPICLabelUId(); Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); unsigned TF = MO1.getTargetFlags(); bool isARM = Opcode != ARM::t2MOV_ga_pcrel; unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel; unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel; unsigned LO16TF = TF | ARMII::MO_LO16; unsigned HI16TF = TF | ARMII::MO_HI16; unsigned PICAddOpc = isARM ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) : ARM::tPICADD; MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg) .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF) .addImm(LabelId); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg) .addReg(DstReg) .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF) .addImm(LabelId); MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg).addImm(LabelId); if (isARM) { MIB3.add(predOps(ARMCC::AL)); if (Opcode == ARM::MOV_ga_pcrel_ldr) MIB3.cloneMemRefs(MI); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); return true; } case ARM::MOVi32imm: case ARM::MOVCCi32imm: case ARM::t2MOVi32imm: case ARM::t2MOVCCi32imm: ExpandMOV32BitImm(MBB, MBBI); return true; case ARM::SUBS_PC_LR: { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC) .addReg(ARM::LR) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)) .addReg(ARM::CPSR, RegState::Undef); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; } case ARM::VLDMQIA: { unsigned NewOpc = ARM::VLDMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; // Grab the Q register destination. bool DstIsDead = MI.getOperand(OpIdx).isDead(); Register DstReg = MI.getOperand(OpIdx++).getReg(); // Copy the source register. MIB.add(MI.getOperand(OpIdx++)); // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Add the destination operands (D subregs). Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0); Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); // Add an implicit def for the super-register.
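// (Without the implicit def the Q register would appear undefined to the
// verifier, since the VLDM above only defines the two D subregisters
// explicitly.)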
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); MIB.cloneMemRefs(MI); MI.eraseFromParent(); return true; } case ARM::VSTMQIA: { unsigned NewOpc = ARM::VSTMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; // Grab the Q register source. bool SrcIsKill = MI.getOperand(OpIdx).isKill(); Register SrcReg = MI.getOperand(OpIdx++).getReg(); // Copy the destination register. MachineOperand Dst(MI.getOperand(OpIdx++)); MIB.add(Dst); // Copy the predicate operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); // Add the source operands (D subregs). Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0) .addReg(D1, SrcIsKill ? RegState::Kill : 0); if (SrcIsKill) // Add an implicit kill for the Q register. MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MIB.cloneMemRefs(MI); MI.eraseFromParent(); return true; } case ARM::VLD2q8Pseudo: case ARM::VLD2q16Pseudo: case ARM::VLD2q32Pseudo: case ARM::VLD2q8PseudoWB_fixed: case ARM::VLD2q16PseudoWB_fixed: case ARM::VLD2q32PseudoWB_fixed: case ARM::VLD2q8PseudoWB_register: case ARM::VLD2q16PseudoWB_register: case ARM::VLD2q32PseudoWB_register: case ARM::VLD3d8Pseudo: case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: case ARM::VLD1d8TPseudo: case ARM::VLD1d16TPseudo: case ARM::VLD1d32TPseudo: case ARM::VLD1d64TPseudo: case ARM::VLD1d64TPseudoWB_fixed: case ARM::VLD1d64TPseudoWB_register: case ARM::VLD3d8Pseudo_UPD: case ARM::VLD3d16Pseudo_UPD: case ARM::VLD3d32Pseudo_UPD: case ARM::VLD3q8Pseudo_UPD: case ARM::VLD3q16Pseudo_UPD: case ARM::VLD3q32Pseudo_UPD: case ARM::VLD3q8oddPseudo: case ARM::VLD3q16oddPseudo: case ARM::VLD3q32oddPseudo: case ARM::VLD3q8oddPseudo_UPD: case ARM::VLD3q16oddPseudo_UPD: case ARM::VLD3q32oddPseudo_UPD: case ARM::VLD4d8Pseudo: case ARM::VLD4d16Pseudo: case ARM::VLD4d32Pseudo: case ARM::VLD1d8QPseudo: case ARM::VLD1d16QPseudo: case ARM::VLD1d32QPseudo: case ARM::VLD1d64QPseudo: case ARM::VLD1d64QPseudoWB_fixed: case ARM::VLD1d64QPseudoWB_register: case ARM::VLD1q8HighQPseudo: case ARM::VLD1q8LowQPseudo_UPD: case ARM::VLD1q8HighTPseudo: case ARM::VLD1q8LowTPseudo_UPD: case ARM::VLD1q16HighQPseudo: case ARM::VLD1q16LowQPseudo_UPD: case ARM::VLD1q16HighTPseudo: case ARM::VLD1q16LowTPseudo_UPD: case ARM::VLD1q32HighQPseudo: case ARM::VLD1q32LowQPseudo_UPD: case ARM::VLD1q32HighTPseudo: case ARM::VLD1q32LowTPseudo_UPD: case ARM::VLD1q64HighQPseudo: case ARM::VLD1q64LowQPseudo_UPD: case ARM::VLD1q64HighTPseudo: case ARM::VLD1q64LowTPseudo_UPD: case ARM::VLD4d8Pseudo_UPD: case ARM::VLD4d16Pseudo_UPD: case ARM::VLD4d32Pseudo_UPD: case ARM::VLD4q8Pseudo_UPD: case ARM::VLD4q16Pseudo_UPD: case ARM::VLD4q32Pseudo_UPD: case ARM::VLD4q8oddPseudo: case ARM::VLD4q16oddPseudo: case ARM::VLD4q32oddPseudo: case ARM::VLD4q8oddPseudo_UPD: case ARM::VLD4q16oddPseudo_UPD: case ARM::VLD4q32oddPseudo_UPD: case ARM::VLD3DUPd8Pseudo: case ARM::VLD3DUPd16Pseudo: case ARM::VLD3DUPd32Pseudo: case ARM::VLD3DUPd8Pseudo_UPD: case ARM::VLD3DUPd16Pseudo_UPD: case ARM::VLD3DUPd32Pseudo_UPD: case ARM::VLD4DUPd8Pseudo: case ARM::VLD4DUPd16Pseudo: case ARM::VLD4DUPd32Pseudo: case ARM::VLD4DUPd8Pseudo_UPD: case ARM::VLD4DUPd16Pseudo_UPD: case ARM::VLD4DUPd32Pseudo_UPD: case ARM::VLD2DUPq8EvenPseudo: case ARM::VLD2DUPq8OddPseudo: case ARM::VLD2DUPq16EvenPseudo: case ARM::VLD2DUPq16OddPseudo: case ARM::VLD2DUPq32EvenPseudo: 
case ARM::VLD2DUPq32OddPseudo: case ARM::VLD3DUPq8EvenPseudo: case ARM::VLD3DUPq8OddPseudo: case ARM::VLD3DUPq16EvenPseudo: case ARM::VLD3DUPq16OddPseudo: case ARM::VLD3DUPq32EvenPseudo: case ARM::VLD3DUPq32OddPseudo: case ARM::VLD4DUPq8EvenPseudo: case ARM::VLD4DUPq8OddPseudo: case ARM::VLD4DUPq16EvenPseudo: case ARM::VLD4DUPq16OddPseudo: case ARM::VLD4DUPq32EvenPseudo: case ARM::VLD4DUPq32OddPseudo: ExpandVLD(MBBI); return true; case ARM::VST2q8Pseudo: case ARM::VST2q16Pseudo: case ARM::VST2q32Pseudo: case ARM::VST2q8PseudoWB_fixed: case ARM::VST2q16PseudoWB_fixed: case ARM::VST2q32PseudoWB_fixed: case ARM::VST2q8PseudoWB_register: case ARM::VST2q16PseudoWB_register: case ARM::VST2q32PseudoWB_register: case ARM::VST3d8Pseudo: case ARM::VST3d16Pseudo: case ARM::VST3d32Pseudo: case ARM::VST1d8TPseudo: case ARM::VST1d16TPseudo: case ARM::VST1d32TPseudo: case ARM::VST1d64TPseudo: case ARM::VST3d8Pseudo_UPD: case ARM::VST3d16Pseudo_UPD: case ARM::VST3d32Pseudo_UPD: case ARM::VST1d64TPseudoWB_fixed: case ARM::VST1d64TPseudoWB_register: case ARM::VST3q8Pseudo_UPD: case ARM::VST3q16Pseudo_UPD: case ARM::VST3q32Pseudo_UPD: case ARM::VST3q8oddPseudo: case ARM::VST3q16oddPseudo: case ARM::VST3q32oddPseudo: case ARM::VST3q8oddPseudo_UPD: case ARM::VST3q16oddPseudo_UPD: case ARM::VST3q32oddPseudo_UPD: case ARM::VST4d8Pseudo: case ARM::VST4d16Pseudo: case ARM::VST4d32Pseudo: case ARM::VST1d8QPseudo: case ARM::VST1d16QPseudo: case ARM::VST1d32QPseudo: case ARM::VST1d64QPseudo: case ARM::VST4d8Pseudo_UPD: case ARM::VST4d16Pseudo_UPD: case ARM::VST4d32Pseudo_UPD: case ARM::VST1d64QPseudoWB_fixed: case ARM::VST1d64QPseudoWB_register: case ARM::VST1q8HighQPseudo: case ARM::VST1q8LowQPseudo_UPD: case ARM::VST1q8HighTPseudo: case ARM::VST1q8LowTPseudo_UPD: case ARM::VST1q16HighQPseudo: case ARM::VST1q16LowQPseudo_UPD: case ARM::VST1q16HighTPseudo: case ARM::VST1q16LowTPseudo_UPD: case ARM::VST1q32HighQPseudo: case ARM::VST1q32LowQPseudo_UPD: case ARM::VST1q32HighTPseudo: case ARM::VST1q32LowTPseudo_UPD: case ARM::VST1q64HighQPseudo: case ARM::VST1q64LowQPseudo_UPD: case ARM::VST1q64HighTPseudo: case ARM::VST1q64LowTPseudo_UPD: case ARM::VST4q8Pseudo_UPD: case ARM::VST4q16Pseudo_UPD: case ARM::VST4q32Pseudo_UPD: case ARM::VST4q8oddPseudo: case ARM::VST4q16oddPseudo: case ARM::VST4q32oddPseudo: case ARM::VST4q8oddPseudo_UPD: case ARM::VST4q16oddPseudo_UPD: case ARM::VST4q32oddPseudo_UPD: ExpandVST(MBBI); return true; case ARM::VLD1LNq8Pseudo: case ARM::VLD1LNq16Pseudo: case ARM::VLD1LNq32Pseudo: case ARM::VLD1LNq8Pseudo_UPD: case ARM::VLD1LNq16Pseudo_UPD: case ARM::VLD1LNq32Pseudo_UPD: case ARM::VLD2LNd8Pseudo: case ARM::VLD2LNd16Pseudo: case ARM::VLD2LNd32Pseudo: case ARM::VLD2LNq16Pseudo: case ARM::VLD2LNq32Pseudo: case ARM::VLD2LNd8Pseudo_UPD: case ARM::VLD2LNd16Pseudo_UPD: case ARM::VLD2LNd32Pseudo_UPD: case ARM::VLD2LNq16Pseudo_UPD: case ARM::VLD2LNq32Pseudo_UPD: case ARM::VLD3LNd8Pseudo: case ARM::VLD3LNd16Pseudo: case ARM::VLD3LNd32Pseudo: case ARM::VLD3LNq16Pseudo: case ARM::VLD3LNq32Pseudo: case ARM::VLD3LNd8Pseudo_UPD: case ARM::VLD3LNd16Pseudo_UPD: case ARM::VLD3LNd32Pseudo_UPD: case ARM::VLD3LNq16Pseudo_UPD: case ARM::VLD3LNq32Pseudo_UPD: case ARM::VLD4LNd8Pseudo: case ARM::VLD4LNd16Pseudo: case ARM::VLD4LNd32Pseudo: case ARM::VLD4LNq16Pseudo: case ARM::VLD4LNq32Pseudo: case ARM::VLD4LNd8Pseudo_UPD: case ARM::VLD4LNd16Pseudo_UPD: case ARM::VLD4LNd32Pseudo_UPD: case ARM::VLD4LNq16Pseudo_UPD: case ARM::VLD4LNq32Pseudo_UPD: case ARM::VST1LNq8Pseudo: case ARM::VST1LNq16Pseudo: case 
ARM::VST1LNq32Pseudo: case ARM::VST1LNq8Pseudo_UPD: case ARM::VST1LNq16Pseudo_UPD: case ARM::VST1LNq32Pseudo_UPD: case ARM::VST2LNd8Pseudo: case ARM::VST2LNd16Pseudo: case ARM::VST2LNd32Pseudo: case ARM::VST2LNq16Pseudo: case ARM::VST2LNq32Pseudo: case ARM::VST2LNd8Pseudo_UPD: case ARM::VST2LNd16Pseudo_UPD: case ARM::VST2LNd32Pseudo_UPD: case ARM::VST2LNq16Pseudo_UPD: case ARM::VST2LNq32Pseudo_UPD: case ARM::VST3LNd8Pseudo: case ARM::VST3LNd16Pseudo: case ARM::VST3LNd32Pseudo: case ARM::VST3LNq16Pseudo: case ARM::VST3LNq32Pseudo: case ARM::VST3LNd8Pseudo_UPD: case ARM::VST3LNd16Pseudo_UPD: case ARM::VST3LNd32Pseudo_UPD: case ARM::VST3LNq16Pseudo_UPD: case ARM::VST3LNq32Pseudo_UPD: case ARM::VST4LNd8Pseudo: case ARM::VST4LNd16Pseudo: case ARM::VST4LNd32Pseudo: case ARM::VST4LNq16Pseudo: case ARM::VST4LNq32Pseudo: case ARM::VST4LNd8Pseudo_UPD: case ARM::VST4LNd16Pseudo_UPD: case ARM::VST4LNd32Pseudo_UPD: case ARM::VST4LNq16Pseudo_UPD: case ARM::VST4LNq32Pseudo_UPD: ExpandLaneOp(MBBI); return true; case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true; case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; case ARM::CMP_SWAP_8: if (STI->isThumb()) return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB, NextMBBI); else return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB, ARM::UXTB, NextMBBI); case ARM::CMP_SWAP_16: if (STI->isThumb()) return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH, NextMBBI); else return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH, NextMBBI); case ARM::CMP_SWAP_32: if (STI->isThumb()) return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, NextMBBI); else return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI); case ARM::CMP_SWAP_64: return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); - - case ARM::tBL_PUSHLR: - case ARM::BL_PUSHLR: { - const bool Thumb = Opcode == ARM::tBL_PUSHLR; - Register Reg = MI.getOperand(0).getReg(); - assert(Reg == ARM::LR && "expect LR register!"); - MachineInstrBuilder MIB; - if (Thumb) { - // push {lr} - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH)) - .add(predOps(ARMCC::AL)) - .addReg(Reg); - - // bl __gnu_mcount_nc - MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL)); - } else { - // stmdb sp!, {lr} - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD)) - .addReg(ARM::SP, RegState::Define) - .addReg(ARM::SP) - .add(predOps(ARMCC::AL)) - .addReg(Reg); - - // bl __gnu_mcount_nc - MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); - } - MIB.cloneMemRefs(MI); - for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); - MI.eraseFromParent(); - return true; - } } } bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); Modified |= ExpandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } return Modified; } bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); AFI = MF.getInfo<ARMFunctionInfo>(); LLVM_DEBUG(dbgs() << "********** ARM EXPAND PSEUDO INSTRUCTIONS **********\n" << "********** Function: " << MF.getName() << '\n'); bool Modified = false; for (MachineBasicBlock &MBB : MF) Modified
|= ExpandMBB(MBB); if (VerifyARMPseudo) MF.verify(this, "After expanding ARM pseudo instructions."); LLVM_DEBUG(dbgs() << "***************************************************\n"); return Modified; } /// createARMExpandPseudoPass - returns an instance of the pseudo instruction /// expansion pass. FunctionPass *llvm::createARMExpandPseudoPass() { return new ARMExpandPseudo(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 63b8f2acd3a8..9bf44580192e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1,16622 +1,16578 @@ //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that ARM uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "ARMISelLowering.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" #include "ARMRegisterInfo.h" #include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "Utils/ARMBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include 
"llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "arm-isel" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); STATISTIC(NumConstpoolPromoted, "Number of constants with their storage promoted into constant pools"); static cl::opt ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); static cl::opt EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64)); static cl::opt ConstpoolPromotionMaxTotal( "arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); } MVT ElemTy = VT.getVectorElementType(); if (ElemTy != MVT::f64) setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); if (ElemTy == MVT::i32) { setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); } else { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); } // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); } // Neon does not support vector divide/remainder operations. setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPRRegClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPairRegClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } void ARMTargetLowering::setAllExpand(MVT VT) { for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) setOperationAction(Opc, VT, Expand); // We support these really simple operations even on types where all // the actual arithmetic has to be broken down into simpler // operations or turned into library calls. 
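// setAllExpand marked every opcode Expand above; the handful of actions
// below re-legalize just the moves, so values of the type can still live in
// registers and be spilled/reloaded.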
setOperationAction(ISD::BITCAST, VT, Legal); setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); setOperationAction(ISD::UNDEF, VT, Legal); } void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action) { setLoadExtAction(ISD::EXTLOAD, From, To, Action); setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); setLoadExtAction(ISD::SEXTLOAD, From, To, Action); } void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; for (auto VT : IntTypes) { addRegisterClass(VT, &ARM::QPRRegClass); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); } } const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; for (auto VT : FloatTypes) { addRegisterClass(VT, &ARM::QPRRegClass); if (!HasMVEFP) setAllExpand(VT); // These are legal or custom whether we have MVE.fp or not setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); } if (HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); // No native support for these. 
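// Marking these Expand scalarizes the vector operation, so each lane falls
// back to the scalar instruction or the corresponding libcall.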
setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); } } // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only FP data processing on the FP // vector types is inhibited at the integer-only level. const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; for (auto VT : LongTypes) { addRegisterClass(VT, &ARM::QPRRegClass); setAllExpand(VT); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); } // We can do bitwise operations on v2i64 vectors setOperationAction(ISD::AND, MVT::v2i64, Legal); setOperationAction(ISD::OR, MVT::v2i64, Legal); setOperationAction(ISD::XOR, MVT::v2i64, Legal); // It is legal to extload from v4i8 to v4i16 or v4i32. addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); // Some truncating stores are legal too. setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); // Pre and Post inc on these are legal, given the correct extends for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::v8i8, Legal); setIndexedStoreAction(im, MVT::v8i8, Legal); setIndexedLoadAction(im, MVT::v4i8, Legal); setIndexedStoreAction(im, MVT::v4i8, Legal); setIndexedLoadAction(im, MVT::v4i16, Legal); setIndexedStoreAction(im, MVT::v4i16, Legal); } // Predicate types const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); } } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { RegInfo = Subtarget->getRegisterInfo(); Itins = Subtarget->getInstrItineraryData(); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && !Subtarget->isTargetWatchOS()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), IsHFTarget ? CallingConv::ARM_AAPCS_VFP : CallingConv::ARM_AAPCS); } if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available.
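// The *vfp-suffixed routines pass their operands in VFP registers. For the
// comparison entries, the CondCode recorded with setCmpLibcallCC below
// describes how the libcall's integer result is tested against zero.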
if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const ISD::CondCode Cond; } LibraryCalls[] = { // Single-precision floating-point arithmetic. { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, // Double-precision floating-point arithmetic. { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, // Single-precision comparisons. { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, // Double-precision comparisons. { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, // Floating-point to integer conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, // Conversions between floating types. { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, // Integer to floating-point conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. // FIXME: There appears to be some naming inconsistency in ARM libgcc: // e.g., __floatunsidf vs. __floatunssidfvfp. { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } } // These libcalls are not available in 32-bit. 
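// Setting the name to nullptr tells the legalizer no runtime routine exists,
// so 128-bit shifts are expanded inline instead of emitted as calls.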
setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); // RTLIB if (Subtarget->isAAPCS_ABI() && (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } LibraryCalls[] = { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Double-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 3 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Single-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 4 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Single-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 5 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Floating-point to integer conversions. // RTABI chapter 4.1.2, Table 6 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Conversions between floating types. 
// RTABI chapter 4.1.2, Table 7 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer to floating-point conversions. // RTABI chapter 4.1.2, Table 8 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Long long helper functions // RTABI chapter 4.2, Table 9 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer division functions // RTABI chapter 4.3.1 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } // EABI dependent RTLIB if (TM.Options.EABIVersion == EABI::EABI4 || TM.Options.EABIVersion == EABI::EABI5) { static const struct { const RTLIB::Libcall Op; const char *const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } MemOpsLibraryCalls[] = { // Memory operations // RTABI chapter 4.3.4 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : MemOpsLibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } } if (Subtarget->isTargetWindows()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F32, "__i64tos", 
CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } // Use divmod compiler-rt calls for iOS 5.0 and later. if (Subtarget->isTargetMachO() && !(Subtarget->isTargetIOS() && Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } // The half <-> float conversion functions are always soft-float on // non-watchos platforms, but are needed for some targets which use a // hard-float calling convention by default. if (!Subtarget->isTargetWatchABI()) { if (Subtarget->isAAPCS_ABI()) { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); } else { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } } // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && Subtarget->hasFPRegs()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); if (!Subtarget->hasVFP2Base()) setAllExpand(MVT::f32); if (!Subtarget->hasFP64()) setAllExpand(MVT::f64); } if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); if (Subtarget->hasMVEIntegerOps()) addMVEVectorTypes(Subtarget->hasMVEFloatOps()); // Combine low-overhead loop intrinsics so that we can lower i1 types. 
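// (The hardware-loop intrinsics feed i1 values into branches; combining at
// the BRCOND/BR_CC level lets us catch them before type legalization has
// rewritten the i1s into something harder to match.)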
if (Subtarget->hasLOB()) { setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BR_CC); } if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); if (Subtarget->hasFullFP16()) { addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } } if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { // v2f64 is legal so that QR subregs can be extracted as f64 elements, but // none of Neon, MVE or VFP supports any arithmetic operations on it. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); // FIXME: Code duplication: FDIV and FREM are expanded always, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::FDIV, MVT::v2f64, Expand); setOperationAction(ISD::FREM, MVT::v2f64, Expand); // FIXME: Create unittest. // In other words, handle the case when "copysign" appears in a DAG with // vector operands. setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); // FIXME: Code duplication: SETCC has custom operation action, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::SETCC, MVT::v2f64, Expand); // FIXME: Create unittest for FNEG and for FABS. setOperationAction(ISD::FNEG, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); setOperationAction(ISD::FSIN, MVT::v2f64, Expand); setOperationAction(ISD::FCOS, MVT::v2f64, Expand); setOperationAction(ISD::FPOW, MVT::v2f64, Expand); setOperationAction(ISD::FLOG, MVT::v2f64, Expand); setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); setOperationAction(ISD::FEXP, MVT::v2f64, Expand); setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); setOperationAction(ISD::FRINT, MVT::v2f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); } if (Subtarget->hasNEON()) { // The same goes for v4f32, but keep in mind that vadd, vsub and vmul are // natively supported for v4f32. setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); setOperationAction(ISD::FPOW, MVT::v4f32, Expand); setOperationAction(ISD::FLOG, MVT::v4f32, Expand); setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); setOperationAction(ISD::FEXP, MVT::v4f32, Expand); setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); // Mark v2f32 intrinsics.
setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); setOperationAction(ISD::FSIN, MVT::v2f32, Expand); setOperationAction(ISD::FCOS, MVT::v2f32, Expand); setOperationAction(ISD::FPOW, MVT::v2f32, Expand); setOperationAction(ISD::FLOG, MVT::v2f32, Expand); setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); setOperationAction(ISD::FEXP, MVT::v2f32, Expand); setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); setOperationAction(ISD::FRINT, MVT::v2f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); // Custom handling for some quad-vector types to detect VMULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Custom handling for some vector types to avoid expensive expansions. setOperationAction(ISD::SDIV, MVT::v4i16, Custom); setOperationAction(ISD::SDIV, MVT::v8i8, Custom); setOperationAction(ISD::UDIV, MVT::v4i16, Custom); setOperationAction(ISD::UDIV, MVT::v8i8, Custom); // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with // a destination type that is wider than the source, nor does // it have a FP_TO_[SU]INT instruction with a narrower destination than // the source. setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); // NEON does not have single instruction CTPOP for vectors with element // types wider than 8 bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); // NEON does not have single instruction CTTZ for vectors.
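// The custom lowering can build CTTZ on top of the vcnt-based CTPOP above,
// using the standard bit trick (a sketch of the idea, not the exact DAG):
//   cttz(x) = ctpop((x & -x) - 1)
// x & -x isolates the lowest set bit; subtracting 1 turns the trailing
// zeros into a mask of ones, which ctpop then counts.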
setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); } } } if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); } if (!Subtarget->hasFP64()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present. However, no double-precision operations other than moves, // loads and stores are provided by the hardware.
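// For example (illustrative), on a single-precision-only FPU such as the
// FPv4-SP unit on some Cortex-M4 parts, (fadd f64:$a, f64:$b) has no
// instruction, so the Expand entries below turn it into the AEABI libcall
//   __aeabi_dadd(a, b)
// while f64 loads, stores and register moves remain legal.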
setOperationAction(ISD::FADD, MVT::f64, Expand); setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::FMUL, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FDIV, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); setOperationAction(ISD::FNEG, MVT::f64, Expand); setOperationAction(ISD::FABS, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FLOG, MVT::f64, Expand); setOperationAction(ISD::FLOG2, MVT::f64, Expand); setOperationAction(ISD::FLOG10, MVT::f64, Expand); setOperationAction(ISD::FEXP, MVT::f64, Expand); setOperationAction(ISD::FEXP2, MVT::f64, Expand); setOperationAction(ISD::FCEIL, MVT::f64, Expand); setOperationAction(ISD::FTRUNC, MVT::f64, Expand); setOperationAction(ISD::FRINT, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); } if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } if (!Subtarget->hasFP16()) setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); if (!Subtarget->hasFP64()) setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); } // ... or truncating stores setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); // ARM does not have i1 sign extending load. for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // ARM supports all 4 flavors of integer indexed load / store. if (!Subtarget->isThumb1Only()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i1, Legal); setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i1, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); } } else { // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
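// That is, only a post-incremented i32 access can be selected, e.g.
// (registers illustrative):
//   ldm r0!, {r1}    ; r1 = *r0, then r0 += 4
// so only POST_INC on MVT::i32 is marked Legal below.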
setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); } setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom if (Subtarget->hasMVEIntegerOps()) setOperationAction(ISD::SHL, MVT::i64, Custom); // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. if (Subtarget->isThumb1Only()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); } if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { setOperationAction(ISD::CTLZ, MVT::i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); } // @llvm.readcyclecounter requires the Performance Monitors extension. // Default to the 0 expansion on unsupported platforms. // FIXME: Technically there are older ARM CPUs that have // implementation-specific ways of obtaining this information. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() : Subtarget->hasDivideInARMMode(); if (!hasDivide) { // These are expanded into libcalls if the cpu doesn't have HW divider. 
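// Under the AEABI runtime this means an i32 (sdiv $a, $b) becomes a call
// such as __aeabi_idiv (unsigned: __aeabi_uidiv), with the quotient
// returned in r0. This is a sketch of the resulting call; the exact symbol
// depends on the target environment.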
setOperationAction(ISD::SDIV, MVT::i32, LibCall); setOperationAction(ISD::UDIV, MVT::i32, LibCall); } if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); setOperationAction(ISD::SDIV, MVT::i64, Custom); setOperationAction(ISD::UDIV, MVT::i64, Custom); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetWindows()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; if (Subtarget->isTargetWindows()) { const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } else { const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); setOperationAction(ISD::SDIVREM, MVT::i64, Custom); setOperationAction(ISD::UDIVREM, MVT::i64, Custom); } else { setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); } if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT()) for (auto &VT : {MVT::f32, MVT::f64}) setOperationAction(ISD::FPOWI, VT, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // Use the default implementation. 
setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. InsertFencesForAtomic = false; if (Subtarget->hasAnyDataBarrier() && (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { // ATOMIC_FENCE needs custom lowering; the others should have been expanded // to ldrex/strex loops already. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); if (!Subtarget->isThumb() || !Subtarget->isMClass()) setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. if (!Subtarget->hasAcquireRelease() || getTargetMachine().getOptLevel() == 0) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. InsertFencesForAtomic = true; } } else { // If there's anything we can use as a barrier, go through custom lowering // for ATOMIC_FENCE. // If target has DMB in thumb, Fences can be inserted. if (Subtarget->hasDataBarrier()) InsertFencesForAtomic = true; setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Subtarget->hasAnyDataBarrier() ? Custom : Expand); // Set them all for expansion, which will force libcalls. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the // Unordered/Monotonic case. if (!InsertFencesForAtomic) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); } } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::SETCC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); } setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::BR_CC, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Custom); // We don't support sin/cos/fmod/copysign/pow setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); } setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); } // Various VFP goodness if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } } // Use __sincos_stret if available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } // FP-ARMv8 implements a lot of rounding-like FP operations. 
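// For instance (assuming the usual FP-ARMv8 mapping; shown as a sketch):
//   (ffloor f32:$x) -> vrintm.f32   ; round towards minus infinity
//   (fceil  f32:$x) -> vrintp.f32   ; round towards plus infinity
//   (ftrunc f32:$x) -> vrintz.f32   ; round towards zero
// which is why these nodes can simply be marked Legal below.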
if (Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); if (Subtarget->hasNEON()) { setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); } if (Subtarget->hasFP64()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } // FP16 operations often need to be promoted to call library functions. if (Subtarget->hasFullFP16()) { setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); setOperationAction(ISD::FSIN, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::f16, Promote); setOperationAction(ISD::FSINCOS, MVT::f16, Promote); setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); setOperationAction(ISD::FEXP, MVT::f16, Promote); setOperationAction(ISD::FEXP2, MVT::f16, Promote); setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Legal); } if (Subtarget->hasNEON()) { // vmin and vmax aren't available in a scalar form, so we use // a NEON instruction with an undef lane instead.
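// E.g. a scalar (fminimum f32:$a, f32:$b) can be emitted as something like
//   vmin.f32 d0, d1, d2
// where only lane 0 of d1/d2 holds $a/$b, the other lane is undef, and the
// result is read back from lane 0 (register choice here is illustrative).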
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); } } // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) setTargetDAGCombine(ISD::SHL); setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); // Temporary: rewrite interface to use type. MaxStoresPerMemset = 8; MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); if (Subtarget->isThumb() || Subtarget->isThumb2()) setTargetDAGCombine(ISD::ABS); } bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently, // SPR's representative would be DPR_VFP2. This should work well if register // pressure tracking were modified such that a register use would increment the // pressure of the register class's representative and all of its super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes // due to the common occurrence of cross class copies and subregister insertions // and extractions. std::pair<const TargetRegisterClass *, uint8_t> ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); // Use DPR as representative register class for all floating point // and vector types.
Since there are 32 SPR registers and 32 DPR registers so // the cost is 1 for both f32 and f64. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: RRC = &ARM::DPRRegClass; // When NEON is used for SP, only half of the register file is available // because operations that define both SP and DP results will be constrained // to the VFP2 class (D0-D15). We currently model this constraint prior to // coalescing by double-counting the SP regs. See the FIXME above. if (Subtarget->useNEONForSinglePrecisionFP()) Cost = 2; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: RRC = &ARM::DPRRegClass; Cost = 2; break; case MVT::v4i64: RRC = &ARM::DPRRegClass; Cost = 4; break; case MVT::v8i64: RRC = &ARM::DPRRegClass; Cost = 8; break; } return std::make_pair(RRC, Cost); } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((ARMISD::NodeType)Opcode) { case ARMISD::FIRST_NUMBER: break; case ARMISD::Wrapper: return "ARMISD::Wrapper"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; case ARMISD::SUBS: return "ARMISD::SUBS"; case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; case ARMISD::ASRL: return "ARMISD::ASRL"; case ARMISD::LSRL: return "ARMISD::LSRL"; case ARMISD::LSLL: return "ARMISD::LSLL"; case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; case ARMISD::ADDC: return "ARMISD::ADDC"; case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; case ARMISD::LSLS: return "ARMISD::LSLS"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; case ARMISD::VCMP: 
return "ARMISD::VCMP"; case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHLs: return "ARMISD::VSHLs"; case ARMISD::VSHLu: return "ARMISD::VSHLu"; case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; case ARMISD::VREV64: return "ARMISD::VREV64"; case ARMISD::VREV32: return "ARMISD::VREV32"; case ARMISD::VREV16: return "ARMISD::VREV16"; case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; case ARMISD::SMULWB: return "ARMISD::SMULWB"; case ARMISD::SMULWT: return "ARMISD::SMULWT"; case ARMISD::SMLALD: return "ARMISD::SMLALD"; case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; case ARMISD::WLS: return "ARMISD::WLS"; case ARMISD::LE: return "ARMISD::LE"; case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; } return nullptr; } EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); // MVE has a predicate register. if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } /// getRegClassFor - Return the register class that should be used for the /// specified value type. const TargetRegisterClass * ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { (void)isDivergent; // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive // MVE Q registers. if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } return TargetLowering::getRegClassFor(VT); } // memcpy, and other memory intrinsics, typically try to use LDM/STM if the // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const { if (!isa<MemIntrinsic>(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); return true; } // Create a fast isel object. FastISel * ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return ARM::createFastISel(funcInfo, libInfo); } Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) return Sched::RegPressure; for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); if (VT == MVT::Glue || VT == MVT::Other) continue; if (VT.isFloatingPoint() || VT.isVector()) return Sched::ILP; } if (!N->isMachineOpcode()) return Sched::RegPressure; // Loads are scheduled for latency even if the instruction itinerary // is not available.
const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); if (MCID.getNumDefs() == 0) return Sched::RegPressure; if (!Itins->isEmpty() && Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) return Sched::ILP; return Sched::RegPressure; } //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// static bool isSRL16(const SDValue &Op) { if (Op.getOpcode() != ISD::SRL) return false; if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } static bool isSRA16(const SDValue &Op) { if (Op.getOpcode() != ISD::SRA) return false; if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } static bool isSHL16(const SDValue &Op) { if (Op.getOpcode() != ISD::SHL) return false; if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } // Check for a signed 16-bit value. We special case SRA because it makes it // simpler when also looking for SRAs that aren't sign extending a // smaller value. Without the check, we'd need to take extra care with // checking order for some operations. static bool isS16(const SDValue &Op, SelectionDAG &DAG) { if (isSRA16(Op)) return isSHL16(Op.getOperand(0)); return DAG.ComputeNumSignBits(Op) == 17; } /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC. static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return ARMCC::NE; case ISD::SETEQ: return ARMCC::EQ; case ISD::SETGT: return ARMCC::GT; case ISD::SETGE: return ARMCC::GE; case ISD::SETLT: return ARMCC::LT; case ISD::SETLE: return ARMCC::LE; case ISD::SETUGT: return ARMCC::HI; case ISD::SETUGE: return ARMCC::HS; case ISD::SETULT: return ARMCC::LO; case ISD::SETULE: return ARMCC::LS; } } /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
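/// For example, SETOGT maps directly to ARMCC::GT, while a code with no
/// single ARM equivalent such as SETONE needs two tests: CondCode = MI and
/// CondCode2 = GT, and the caller must then check both conditions.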
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { CondCode2 = ARMCC::AL; InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = ARMCC::EQ; InvalidOnQNaN = false; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; InvalidOnQNaN = false; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; InvalidOnQNaN = false; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = ARMCC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = ARMCC::NE; InvalidOnQNaN = false; break; } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// /// getEffectiveCallingConv - Get the effective calling convention, taking into /// account presence of floating point hardware and calling convention /// limitations, such as support for variadic functions. CallingConv::ID ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, bool isVarArg) const { switch (CC) { default: report_fatal_error("Unsupported calling convention"); case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: case CallingConv::GHC: return CC; case CallingConv::PreserveMost: return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; case CallingConv::Fast: case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; } } CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const { return CCAssignFnForNode(CC, false, isVarArg); } CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const { return CCAssignFnForNode(CC, true, isVarArg); } /// CCAssignFnForNode - Selects the correct CCAssignFn for the given /// CallingConvention. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const { switch (getEffectiveCallingConv(CC, isVarArg)) { default: report_fatal_error("Unsupported calling convention"); case CallingConv::ARM_APCS: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); case CallingConv::ARM_AAPCS: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); case CallingConv::ARM_AAPCS_VFP: return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); case CallingConv::Fast: return (Return ? 
RetFastCC_ARM_APCS : FastCC_ARM_APCS); case CallingConv::GHC: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); case CallingConv::PreserveMost: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val; if (VA.needsCustom()) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); if (VA.getLocVT() == MVT::v2f64) { SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(0, dl, MVT::i32)); VA = RVLocs[++i]; // skip ahead to next loc Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(1, dl, MVT::i32)); } } else { Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } /// LowerMemOpCallTo - Store the argument to the stack. 
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); if (NextVA.isRegLoc()) RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, Flags)); } } /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); bool PreferIndirect = false; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; if (isa(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding. auto *GV = cast(Callee)->getGlobal(); if (CLI.CS) { auto *BB = CLI.CS.getParent(); PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa(U) && cast(U)->getParent() == BB; }) > 2; } } if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization( Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, PreferIndirect); if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) ++NumTailCalls; } // Analyze operands of the call, assigning locations to each operand. 
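// As a concrete (illustrative) AAPCS example: for a call f(i32, i64, i32),
// the first i32 lands in r0, the i64 takes the doubleword-aligned pair
// r2:r3 (r1 is skipped for alignment and is not back-filled), and the
// trailing i32 is assigned a stack slot.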
SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (isTailCall) { // For tail calls, memory operands are available in our caller's stack. NumBytes = 0; } else { // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); } SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector MemOpChains; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } // f64 and v2f64 might be passed in i32 pairs and must be split into pieces if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); VA = ArgLocs[++i]; // skip ahead to next loc if (VA.isRegLoc()) { PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } else { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } } else { PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { assert(VA.getLocVT() == MVT::i32 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i32 && "unexpected use of 'returned'"); isThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); unsigned offset = 0; // True if this byval aggregate will be split between registers // and memory. 
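// For instance (sizes made up): a 12-byte byval passed when only r2 and r3
// remain free gets its first 8 bytes loaded into r2/r3 by the loop below,
// while the final 4 bytes are copied to the outgoing stack area; "offset"
// records how many words the registers consumed.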
unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); if (CurByValIdx < ByValArgsCount) { unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } // If the parameter size extends beyond the register area, the "offset" value // helps us to calculate the stack slot for the remaining part properly. offset = RegEnd - RegBegin; CCInfo.nextInRegsParam(); } if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } } else if (!isTailCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. bool isDirect = false; const TargetMachine &TM = getTargetMachine(); const Module *Mod = MF.getFunction().getParent(); const GlobalValue *GV = nullptr; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) GV = G->getGlobal(); bool isStub = !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && "long-calls codegen is not position independent!"); // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra.
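// Schematically, the long-call sequence materializes the callee's address
// from the constant pool and calls through a register (a sketch, not exact
// MC output):
//   ldr r12, [pc, #<cpi-offset>]   ; load the callee's address
//   blx r12                        ; indirect call
// instead of a direct BL, whose limited branch range the linker might not
// be able to satisfy.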
if (isa(Callee)) { // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa(Callee)) { if (!PreferIndirect) { isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); // ARM call to a local ARM function is predicable. isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); Callee = DAG.getNode( ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), /* Alignment = */ 0, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } else { Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; // tBX takes a register source operand. const char *Sym = S->getSymbol(); if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); } } // FIXME: handle tail calls differently. 
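// Note on the opcode choice below: CALL_NOLINK is an indirect call that
// does not itself set LR. E.g. without BLX (pre-v5T), an indirect call has
// to be emulated along the lines of (registers illustrative):
//   mov lr, pc
//   mov pc, r0    ; callee address in r0
// which is why !isDirect && !hasV5TOps selects CALL_NOLINK.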
unsigned CallOpc; if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; } else { if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && // Emit regular call when code size is the priority !Subtarget->hasMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. if (!isTailCall) { const uint32_t *Mask; const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable Mask = ARI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { // Set isThisReturn to false if the calling convention is not one that // allows 'returned' to be modeled in this way, so LowerCallResult does // not try to pass 'this' straight through isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); } } else Mask = ARI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo().setHasTailCall(); return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, isThisReturn, isThisReturn ? OutVals[0] : SDValue()); } /// HandleByVal - Every parameter *after* a byval parameter is passed /// on the stack. Remember the next parameter register to allocate, /// and then confiscate the rest of the parameter registers to insure /// this. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, unsigned Align) const { // Byval (as with any stack) slots are always at least 4 byte aligned. Align = std::max(Align, 4U); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned Excess = 4 * (ARM::R4 - Reg); // Special case when NSAA != SP and parameter size greater than size of // all remained GPR regs. In that case we can't split parameter, we must // send it to stack. We also must set NCRN to R4, so waste all // remained registers. const unsigned NSAAOffset = State->getNextStackOffset(); if (NSAAOffset != 0 && Size > Excess) { while (State->AllocateReg(GPRArgRegs)) ; return; } // First register for byval parameter is the first register that wasn't // allocated before this method call, so it would be "reg". 
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // else the parameter would be split between registers and stack, and the
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note: the first register was already allocated at the start of this
  // function; allocate the remaining registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
                                const MachineRegisterInfo *MRI,
                                const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
    const bool isIndirect) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to
  // indicate a return to the hardware. Tail-calling another function would
  // probably break this.
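  // For example (editor's note, illustrative): an IRQ handler must return
  // with "subs pc, lr, #4" to restore CPSR; a sibcall would end in a plain
  // branch, so the exception state would never be unwound.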
if (CallerF.hasFnAttribute("interrupt")) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Check that the call results are passed in the same way. LLVMContext &C = *DAG.getContext(); if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForReturn(CalleeCC, isVarArg), CCAssignFnForReturn(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (CalleeCC != CallerCC) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // If Caller's vararg or byval argument has been split between registers and // stack, do not perform tail call, since part of the argument is in caller's // local frame. const ARMFunctionInfo *AFI_Caller = MF.getInfo(); if (AFI_Caller->getArgRegsSaveSize()) return false; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (VA.needsCustom()) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure // out what to do about this. 
if (!VA.isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; if (RegVT == MVT::v2f64) { if (!ArgLocs[++i].isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; } } else if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } return true; } bool ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); } static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, const SDLoc &DL, SelectionDAG &DAG) { const MachineFunction &MF = DAG.getMachineFunction(); const Function &F = MF.getFunction(); StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset // version of the "preferred return address". These offsets affect the return // instruction if this is a return from PL1 without hypervisor extensions. // IRQ/FIQ: +4 "subs pc, lr, #4" // SWI: 0 "subs pc, lr, #0" // ABORT: +4 "subs pc, lr, #4" // UNDEF: +4/+2 "subs pc, lr, #0" // UNDEF varies depending on where the exception came from ARM or Thumb // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. int64_t LROffset; if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || IntKind == "ABORT") LROffset = 4; else if (IntKind == "SWI" || IntKind == "UNDEF") LROffset = 0; else report_fatal_error("Unsupported interrupt attribute. If present, value " "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, DL, MVT::i32, false)); return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); } SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; // CCState - Info about the registers and stack slots. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); // Analyze outgoing return values. CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) bool isLittleEndian = Subtarget->isLittle(); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); AFI->setReturnRegsCount(RVLocs.size()); // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; bool ReturnF16 = false; if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { // Half-precision return values can be returned like this: // // t11 f16 = fadd ... // t12: i16 = bitcast t11 // t13: i32 = zero_extend t12 // t14: f32 = bitcast t13 <~~~~~~~ Arg // // to avoid code generation for bitcasts, we simply set Arg to the node // that produces the f16 value, t11 in this case. 
// if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { SDValue ZE = Arg.getOperand(0); if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { SDValue BC = ZE.getOperand(0); if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { Arg = BC.getOperand(0); ReturnF16 = true; } } } } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: if (!ReturnF16) Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc // Extract the 2nd half and fall through to handle it as an f64 value. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); } // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is // available. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), ReturnF16 ? MVT::f16 : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (ARM::GPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i32)); else if (ARM::DPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } // Update chain and glue. RetOps[0] = Chain; if (Flag.getNode()) RetOps.push_back(Flag); // CPUs which aren't M-class use a special sequence to return from // exceptions (roughly, any instruction setting pc and cpsr simultaneously, // though we use "subs pc, lr, #N"). // // M-class CPUs actually use a normal return sequence with a special // (hardware-provided) value in LR, so the normal code path works. 
if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && !Subtarget->isMClass()) { if (Subtarget->isThumb1Only()) report_fatal_error("interrupt attribute is not supported in Thumb1"); return LowerInterruptReturn(RetOps, dl, DAG); } return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet Copies; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::CopyToReg) return false; Copies.insert(*UI); } if (Copies.size() > 2) return false; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { SDValue UseChain = UI->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg Copy = *UI; else { // We are at the top of this chain. // If the copy has a glue operand, we conservatively assume it // isn't safe to perform a tail call. if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) return false; // First CopyToReg TCChain = UseChain; } } } else if (Copy->getOpcode() == ISD::BITCAST) { // f32 returned in a single GPR. if (!Copy->hasOneUse()) return false; Copy = *Copy->use_begin(); if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else { return false; } bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ARMISD::RET_FLAG && UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, // and pass the lower and high parts through. static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue WriteValue = Op->getOperand(2); // This function is only supposed to be called for i64 type argument. 
assert(WriteValue.getValueType() == MVT::i64 && "LowerWRITE_REGISTER called for non-i64 type argument."); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(1, DL, MVT::i32)); SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here SDLoc dl(Op); ConstantPoolSDNode *CP = cast(Op); SDValue Res; // When generating execute-only code Constant Pools must be promoted to the // global data section. It's a bit ugly that we can't share them across basic // blocks, but this way we guarantee that execute-only behaves correct with // position-independent addressing modes. if (Subtarget->genExecuteOnly()) { auto AFI = DAG.getMachineFunction().getInfo(); auto T = const_cast(CP->getType()); auto C = const_cast(CP->getConstVal()); auto M = const_cast(DAG.getMachineFunction(). getFunction().getParent()); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + Twine(AFI->createPICLabelUId()) ); SDValue GA = DAG.getTargetGlobalAddress(dyn_cast(GV), dl, PtrVT); return LowerGlobalAddress(GA, DAG); } if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); else Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } unsigned ARMTargetLowering::getJumpTableEncoding() const { return MachineJumpTableInfo::EK_Inline; } SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 
                       4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                        ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32]
/// descriptor which contains information used by the runtime to calculate the
/// address. The only part of this the compiler needs to know about is the
/// first word, which contains a function pointer that must be called with the
/// address of the entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not
  // be silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMBaseRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); Chain = DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); } SDValue ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); // Load the current TEB (thread environment block) SDValue Ops[] = {Chain, DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), DAG.getConstant(15, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(13, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(2, DL, MVT::i32)}; SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); SDValue TEB = CurrentTEB.getValue(0); Chain = CurrentTEB.getValue(1); // Load the ThreadLocalStoragePointer from the TEB // A pointer to the TLS array is located at offset 0x2c from the TEB. SDValue TLSArray = DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 // offset into the TLSArray. // Load the TLS index from the C runtime SDValue TLSIndex = DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(2, DL, MVT::i32)); SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), MachinePointerInfo()); // Get the offset of the start of the .tls section (section base) const auto *GA = cast(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, DAG.getTargetConstantPool(CPV, PtrVT, 4)), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); // call __tls_get_addr. 
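  // Editor's sketch (illustrative) of the sequence being built below:
  //   r0 = &tls_descriptor     @ materialized from the constant pool above
  //   bl  __tls_get_addr       @ returns the variable's address in r0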
ArgListTy Args; ArgListEntry Entry; Entry.Node = Argument; Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( CallingConv::C, Type::getInt32Ty(*DAG.getContext()), DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } // Lower ISD::GlobalTLSAddress using the "initial exec" or // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); if (model == TLSModel::InitialExec) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); // Initial exec model. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else { // local exec model assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); if (Subtarget->isTargetDarwin()) return LowerGlobalTLSAddressDarwin(Op, DAG); if (Subtarget->isTargetWindows()) return LowerGlobalTLSAddressWindows(Op, DAG); // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "Only ELF implemented here"); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); switch (model) { case TLSModel::GeneralDynamic: case TLSModel::LocalDynamic: return LowerToTLSGeneralDynamicModel(GA, DAG); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModels(GA, DAG, model); } llvm_unreachable("bogus TLS model"); } /// Return true if all users of V are within function F, looking through /// ConstantExprs. 
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*, 4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);

  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed
  // address, and the global is small enough, we can emit it inline into the
  // constant pool to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size
  // (Sz > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general.
  // The LLVM IR unnamed_addr allows merging constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t, 16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}

bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getBaseObject()))
      return false;
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  return isa<Function>(GV);
}

SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  switch (Subtarget->getTargetTriple().getObjectFormat()) {
  default: llvm_unreachable("unknown object format");
  case Triple::COFF:
    return LowerGlobalAddressWindows(Op, DAG);
  case Triple::ELF:
    return LowerGlobalAddressELF(Op, DAG);
  case Triple::MachO:
    return LowerGlobalAddressDarwin(Op, DAG);
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) &&
      !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                           UseGOT_PREL ? ARMII::MO_GOT : 0);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (UseGOT_PREL)
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative.
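    // Editor's note (illustrative): under RWPI, writable data is addressed
    // relative to the static base register r9, roughly:
    //   movw rT, :lower16:var(sbrel)
    //   movt rT, :upper16:var(sbrel)
    //   add  rT, r9, rT       @ address = static base + SB-relative offset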
SDValue RelAddr; if (Subtarget->useMovt()) { ++NumMovwMovt; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); return Result; } // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. if (Subtarget->useMovt()) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Darwin"); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (Subtarget->useMovt()) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into multiple nodes unsigned Wrapper = isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); if (Subtarget->isGVIndirectSymbol(GV)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt"); assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Windows"); const TargetMachine &TM = getTargetMachine(); const GlobalValue *GV = cast(Op)->getGlobal(); ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; if (GV->hasDLLImportStorageClass()) TargetFlags = ARMII::MO_DLLIMPORT; else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) TargetFlags = ARMII::MO_COFFSTUB; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. 
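  // For reference (editor's note, illustrative): the "two nodes" would be
  // the usual movw/movt pair that builds a 32-bit address in place:
  //   movw r0, :lower16:global
  //   movt r0, :upper16:global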
Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, TargetFlags)); if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Val = DAG.getConstant(0, dl, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1), Val); } SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, Op.getOperand(0)); } -SDValue ARMTargetLowering::LowerINTRINSIC_VOID( - SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { - unsigned IntNo = - cast( - Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) - ->getZExtValue(); - switch (IntNo) { - default: - return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::arm_gnu_eabi_mcount: { - MachineFunction &MF = DAG.getMachineFunction(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - // call "\01__gnu_mcount_nc" - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - const uint32_t *Mask = - ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); - assert(Mask && "Missing call preserved mask for calling convention"); - // Mark LR an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); - SDValue ReturnAddress = - DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); - std::vector ResultTys = {MVT::Other, MVT::Glue}; - SDValue Callee = - DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); - SDValue RegisterMask = DAG.getRegisterMask(Mask); - if (Subtarget->isThumb()) - return SDValue( - DAG.getMachineNode( - ARM::tBL_PUSHLR, dl, ResultTys, - {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), - DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), - 0); - return SDValue( - DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, - {ReturnAddress, Callee, RegisterMask, Chain}), - 0); - } - } -} - SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent(); unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 
4 : 8) : 0; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); if (IsPositionIndependent) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } return Result; } case Intrinsic::arm_neon_vabs: return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) ? ARMISD::VMULLs : ARMISD::VMULLu; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminnm: case Intrinsic::arm_neon_vmaxnm: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) ? ISD::FMINNUM : ISD::FMAXNUM; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminu: case Intrinsic::arm_neon_vmaxu: { if (Op.getValueType().isFloatingPoint()) return SDValue(); unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) ? ISD::UMIN : ISD::UMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vmins: case Intrinsic::arm_neon_vmaxs: { // v{min,max}s is overloaded between signed integers and floats. if (!Op.getValueType().isFloatingPoint()) { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::SMIN : ISD::SMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::FMINIMUM : ISD::FMAXIMUM; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vtbl1: return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::arm_neon_vtbl2: return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc dl(Op); ConstantSDNode *SSIDNode = cast(Op.getOperand(2)); auto SSID = static_cast(SSIDNode->getZExtValue()); if (SSID == SyncScope::SingleThread) return Op; if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, dl, MVT::i32)); } ConstantSDNode *OrdN = cast(Op.getOperand(1)); AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); ARM_MB::MemBOpt Domain = ARM_MB::ISH; if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; } else if (Subtarget->preferISHSTBarriers() && Ord == AtomicOrdering::Release) { // Swift happens to implement ISHST barriers in a way that's compatible with // Release semantics but weaker than ISH so we'd be fools not to use // it. Beware: other processors probably don't! 
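    // Editor's note (illustrative): the effect is emitting "dmb ishst"
    // rather than the stronger "dmb ish" for release fences on Swift, since
    // ISHST orders stores against stores, which is sufficient there.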
Domain = ARM_MB::ISHST; } return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), DAG.getConstant(Domain, dl, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // ARM pre v5TE and Thumb1 does not have preload instructions. if (!(Subtarget->isThumb2() || (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) // Just preserve the chain. return Op.getOperand(0); SDLoc dl(Op); unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) // ARMv7 with MP extension has PLDW. return Op.getOperand(0); unsigned isData = cast(Op.getOperand(4))->getZExtValue(); if (Subtarget->isThumb()) { // Invert the bits. isRead = ~isRead & 1; isData = ~isData & 1; } return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), DAG.getConstant(isData, dl, MVT::i32)); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *FuncInfo = MF.getInfo(); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) RC = &ARM::tGPRRegClass; else RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad( MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); } if (!Subtarget->isLittle()) std::swap (ArgValue, ArgValue2); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } // The remaining GPRs hold either the beginning of variable-argument // data, or the beginning of an aggregate passed by value (usually // byval). Either way, we allocate stack slots adjacent to the data // provided by our caller, and store the unallocated registers there. // If this is a variadic function, the va_list pointer will begin with // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. // Return: The frame index registers were stored into. int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, int ArgOffset, unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. 
  // Non-var-args function, and we meet the first byval parameter.
  // Set up the first unallocated register as the first byval register;
  // eat all remaining registers
  // (these two actions are performed by the HandleByVal method).
  // Then, here, we initialize the stack frame with
  // "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  // The same: eat all remaining unallocated registers,
  // initialize the stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address past the last
  // argument passed via the stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(),
                                  std::max(4U, TotalArgRegsSaveSize));
  AFI->setVarArgsFrameIndex(FrameIndex);
}

SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet a byval parameter.
  // We also increase this value in case of a varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
// We need to know this before we allocate the first byval or variadic // argument, as they will be allocated a stack slot below the CFA (Canonical // Frame Address, the stack pointer at entry to the function). unsigned ArgRegBegin = ARM::R4; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) break; CCValAssign &VA = ArgLocs[i]; unsigned Index = VA.getValNo(); ISD::ArgFlagsTy Flags = Ins[Index].Flags; if (!Flags.isByVal()) continue; assert(VA.isMemLoc() && "unexpected byval pointer in reg"); unsigned RBegin, REnd; CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); ArgRegBegin = std::min(ArgRegBegin, RBegin); CCInfo.nextInRegsParam(); } CCInfo.rewindByValRegsInfo(); int lastInsIndex = -1; if (isVarArg && MFI.hasVAStart()) { unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); if (RegIdx != array_lengthof(GPRArgRegs)) ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[VA.getValNo()].isOrigArg()) { std::advance(CurOrigArg, Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); } // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. if (VA.getLocVT() == MVT::v2f64) { SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI)); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue1, DAG.getIntPtrConstant(0, dl)); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue2, DAG.getIntPtrConstant(1, dl)); } else ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; if (RegVT == MVT::f16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) RC = &ARM::DPRRegClass; else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this value is passed in r0 and has the returned attribute (e.g. // C++ 'structors), record this fact for later use. if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { AFI->setPreservesR0(); } } // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. 
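      // For example (editor's note, illustrative): an i8 signext argument
      // arrives in a full 32-bit register and is rebuilt as
      //   t1: i32 = AssertSext t0, ValueType:i8
      //   t2: i8  = truncate t1
      // so later combines can rely on the known sign extension.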
      switch (VA.getLocInfo()) {
      default:
        llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full:
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);
    } else { // VA.isRegLoc()
      // sanity check
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = VA.getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex) {
        ISD::ArgFlagsTy Flags = Ins[index].Flags;
        // FIXME: For now, all byval parameter objects are marked mutable.
        // This can be changed with more analysis.
        // In case of tail call optimization mark all arguments mutable.
        // Since they could be overwritten by lowering of arguments in case of
        // a tail call.
        if (Flags.isByVal()) {
          assert(Ins[index].isOrigArg() &&
                 "Byval arguments cannot be implicit");
          unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

          int FrameIndex = StoreByValRegs(
              CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
              VA.getLocMemOffset(), Flags.getByValSize());
          InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
          CCInfo.nextInRegsParam();
        } else {
          unsigned FIOffset = VA.getLocMemOffset();
          int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
                                         FIOffset, true);

          // Create load nodes to retrieve arguments from the stack.
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                       MachinePointerInfo::getFixedStack(
                                           DAG.getMachineFunction(), FI)));
        }
        lastInsIndex = index;
      }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart())
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);

  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
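      // Illustration (not from the original source): on Thumb1, cmp only
      // encodes immediates 0-255, so "x < 256" (SETLT, C = 256) is not
      // directly encodable, but the equivalent "x <= 255" (SETLE, C = 255)
      // is.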
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (C != 0x80000000 && isLegalICmpImmediate(C - 1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C - 1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C + 1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C + 1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) !=
              ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) ==
              ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  // Thumb1 has very limited immediate modes, so turning an "and" into a
  // shift can save multiple instructions.
  //
  // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  // into "((x << n) >> n)". But that isn't necessarily profitable on its
  // own. If it's the operand to an unsigned comparison with an immediate,
  // we can eliminate one of the shifts: we transform
  // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  //
  // We avoid transforming cases which aren't profitable due to encoding
  // details:
  //
  // 1. C2 fits into the immediate field of a cmp, and the transformed version
  //    would not; in that case, we're essentially trading one immediate load
  //    for another.
  // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  // 3. C2 is zero; we have other code for this special case.
  //
  // FIXME: Figure out profitability for Thumb2; we usually can't save an
  // instruction, since the AND is always one instruction anyway, but we could
  // use narrow instructions in some cases.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
      LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
      !isSignedIntSetCC(CC)) {
    unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
    auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
    uint64_t RHSV = RHSC->getZExtValue();
    if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 &&
        Mask != 65535) {
      unsigned ShiftBits = countLeadingZeros(Mask);
      if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
        SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
        LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
        RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
      }
    }
  }

  // The specific comparison "(x << c) > 0x80000000U" can be optimized to a
  // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
  // way a cmp would.
  // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  // some tweaks to the heuristics for the previous and->shift transform.
  // FIXME: Optimize cases where the LHS isn't a shift.
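  // Illustration (not from the original source): for c == 3,
  // "(x << 3) > 0x80000000U" holds iff bit 28 of x is set and (x << 4) != 0,
  // which is exactly the C-set/Z-clear (HI) flag state that "lsls r, x, #4"
  // produces.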
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift =
        DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, MVT::i32),
                    LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default:
      break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool InvalidOnQNaN) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
  else
    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                       Cmp.getOperand(1));

  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1),
                      Cmp.getOperand(2));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.
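  // Sketch of the pattern built below (illustration, not from the original
  // source): for ISD::SADDO on i32 we emit
  //   Value       = (add LHS, RHS)
  //   OverflowCmp = (ARMISD::CMP Value, LHS)
  // with ARMcc = VC; "Value - LHS" recomputes RHS, and the V flag of that
  // subtraction is set exactly when the addition overflowed.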
switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::UADDO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); // We use ADDC here to correspond to its use in LowerUnsignedALUO. // We do not use it in the USUBO case as Value may not be used. Value = DAG.getNode(ARMISD::ADDC, dl, DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) .getValue(0); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::SSUBO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::USUBO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::UMULO: // We generate a UMUL_LOHI and then check if the high word is 0. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); Value = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), DAG.getConstant(0, dl, MVT::i32)); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; case ISD::SMULO: // We generate a SMUL_LOHI and then check if all the bits of the high word // are the same as the sign bit of the low word. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); Value = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), DAG.getNode(ISD::SRA, dl, Op.getValueType(), Value.getValue(0), DAG.getConstant(31, dl, MVT::i32))); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; } // switch (...) return std::make_pair(Value, OverflowCmp); } SDValue ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDLoc dl(Op); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); EVT VT = Op.getValueType(); SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, CCR, OverflowCmp); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG) { SDLoc DL(BoolCarry); EVT CarryVT = BoolCarry.getValueType(); // This converts the boolean value carry into the carry flag by doing // ARMISD::SUBC Carry, 1 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(CarryVT, MVT::i32), BoolCarry, DAG.getConstant(1, DL, CarryVT)); return Carry.getValue(1); } static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG) { SDLoc DL(Flags); // Now convert the carry flag into a boolean carry. 
  // We do this using ARMISD::ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, OverflowCmp,
                   DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
        dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
        dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
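  // Illustration (not from the original source): a boolean produced by a
  // setcc may carry anything in bits 31:1, so "select i1 %c, %t, %f" is
  // lowered as "(and %c, 1) != 0 ? %t : %f" to make the comparison well
  // defined.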
Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, DAG.getConstant(1, dl, Cond.getValueType())); return DAG.getSelectCC(dl, Cond, DAG.getConstant(0, dl, Cond.getValueType()), SelectTrue, SelectFalse, ISD::SETNE); } static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for // 'equality' if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) CondCode = ARMCC::GE; // and GT for opcodes that return false for 'equality'. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) CondCode = ARMCC::GT; // Since we are constrained to GE/GT, if the opcode contains 'less', we need // to swap the compare operands. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) swpCmpOps = true; // Both GT and GE are ordered comparisons, and return false for 'unordered'. // If we have an unordered opcode, we need to swap the operands to the VSEL // instruction (effectively negating the condition). // // This also has the effect of swapping which one of 'less' or 'greater' // returns true, so we also swap the compare operands. It also switches // whether we return true for 'equality', so we compensate by picking the // opposite condition code to our original choice. if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || CC == ISD::SETUGT) { swpCmpOps = !swpCmpOps; swpVselOps = !swpVselOps; CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; } // 'ordered' is 'anything but unordered', so use the VS condition code and // swap the VSEL operands. if (CC == ISD::SETO) { CondCode = ARMCC::VS; swpVselOps = true; } // 'unordered or not equal' is 'anything but equal', so use the EQ condition // code and swap the VSEL operands. Also do this if we don't care about the // unordered case. if (CC == ISD::SETUNE || CC == ISD::SETNE) { CondCode = ARMCC::EQ; swpVselOps = true; } } SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { if (!Subtarget->hasFP64() && VT == MVT::f64) { FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal); TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), TrueVal); SDValue TrueLow = TrueVal.getValue(0); SDValue TrueHigh = TrueVal.getValue(1); SDValue FalseLow = FalseVal.getValue(0); SDValue FalseHigh = FalseVal.getValue(1); SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, ARMcc, CCR, Cmp); SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, ARMcc, CCR, duplicateCmp(Cmp, DAG)); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); } else { return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); } } static bool isGTorGE(ISD::CondCode CC) { return CC == ISD::SETGT || CC == ISD::SETGE; } static bool isLTorLE(ISD::CondCode CC) { return CC == ISD::SETLT || CC == ISD::SETLE; } // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. // All of these conditions (and their <= and >= counterparts) will do: // x < k ? k : x // x > k ? x : k // k < x ? x : k // k > x ? 
//   k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}

// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to
// an interval of type [k, ~k] when k + 1 is a power of 2. Here are some
// examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K,
// and usat is set to true if the conditional represents an unsigned
// saturation.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V, uint64_t &K,
                                    bool &usat) {
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1)   ? &LHS1
                : isa<ConstantSDNode>(RHS1) ? &RHS1
                                            : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2)   ? &LHS2
                : isa<ConstantSDNode>(RHS2) ? &RHS2
                                            : nullptr;
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
  // must work with sign-extended values but the select operations return
  // the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)   ? &Op
      : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
                                                                   : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)   ? &Op
      : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
                                                                   : nullptr;

  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
    return false;

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
      isPowerOf2_64(PosVal + 1)) {
    // Handle the difference between USAT (unsigned) and SSAT (signed)
    // saturation
    if (Val1 == ~Val2)
      usat = false;
    else if (NegVal == 0)
      usat = true;
    else
      return false;

    V = V2;
    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

    return true;
  }

  return false;
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller.
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  SDValue *K = isa<ConstantSDNode>(LHS)   ? &LHS
               : isa<ConstantSDNode>(RHS) ? &RHS
                                          : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
       Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl,
                                         VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl,
                                         VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() &&
        (TrueVal.getValueType() == MVT::f16 ||
         TrueVal.getValueType() == MVT::f32 ||
         TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  bool InvalidOnQNaN;
  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
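  // Illustration (not from the original source): "a < b ? x : y" on f32 maps
  // to SETOLT, for which checkVSELConstraints() below picks GT and swaps the
  // compare operands, so we can emit "vcmp b, a" + "vselgt.f32 d, x, y"
  // instead of an unsupported LT form.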
if (Subtarget->hasFPARMv8Base() && !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && (TrueVal.getValueType() == MVT::f16 || TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { if (swpCmpOps) std::swap(LHS, RHS); if (swpVselOps) std::swap(TrueVal, FalseVal); } } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; } /// canChangeToInt - Given the fp compare operand, return true if it is suitable /// to morph to an integer compare sequence. static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget) { SDNode *N = Op.getNode(); if (!N->hasOneUse()) // Otherwise it requires moving the value from fp to integer registers. return false; if (!N->getNumValues()) return false; EVT VT = Op.getValueType(); if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) // f32 case is generally profitable. f64 case only makes sense when vcmpe + // vmrs are very slow, e.g. cortex-a8. return false; if (isFloatingPointZero(Op)) { SeenZero = true; return true; } return ISD::isNormalLoad(N); } static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (isFloatingPointZero(Op)) return DAG.getConstant(0, SDLoc(Op), MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); } static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2) { SDLoc dl(Op); if (isFloatingPointZero(Op)) { RetVal1 = DAG.getConstant(0, dl, MVT::i32); RetVal2 = DAG.getConstant(0, dl, MVT::i32); return; } if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), NewAlign, Ld->getMemOperand()->getFlags()); return; } llvm_unreachable("Unknown VFP cmp argument!"); } /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some /// f32 and even f64 comparisons to integer ones. 
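/// For example (illustration, not from the original source), under unsafe FP
/// math "a == b" on f32 can be checked by comparing the sign-masked integer
/// bit patterns (bits & 0x7fffffff), which also makes +0.0 and -0.0 compare
/// equal and avoids the vcmp + vmrs round trip.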
SDValue ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); bool LHSSeenZero = false; bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); bool RHSSeenZero = false; bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { // If unsafe fp math optimization is enabled and there are no other uses of // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. if (CC == ISD::SETOEQ) CC = ISD::SETEQ; else if (CC == ISD::SETUNE) CC = ISD::SETNE; SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); SDValue ARMcc; if (LHS.getValueType() == MVT::f32) { LHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(LHS, DAG), Mask); RHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(RHS, DAG), Mask); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } SDValue LHS1, LHS2; SDValue RHS1, RHS2; expandf64Toi32(LHS, DAG, LHS1, LHS2); expandf64Toi32(RHS, DAG, RHS1, RHS2); LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); } return SDValue(); } SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = Cond.getOpcode(); bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && !Subtarget->isThumb1Only(); if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || OptimizeMul)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) return SDValue(); // The actual operation with overflow check. SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); // Reverse the condition code. ARMCC::CondCodes CondCode = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); CondCode = ARMCC::getOppositeCondition(CondCode); ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, OverflowCmp); } return SDValue(); } SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( DAG, LHS.getValueType(), LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. 
if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = LHS.getOpcode(); bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && !Subtarget->isThumb1Only(); if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || OptimizeMul) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); if ((CC == ISD::SETNE) != isOneConstant(RHS)) { // Reverse the condition code. ARMCC::CondCodes CondCode = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); CondCode = ARMCC::getOppositeCondition(CondCode); ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); } SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, OverflowCmp); } if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } if (getTargetMachine().Options.UnsafeFPMath && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) return Result; } ARMCC::CondCodes CondCode, CondCode2; bool InvalidOnQNaN; FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); if (CondCode2 != ARMCC::AL) { ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); } return Res; } SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); SDLoc dl(Op); EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table // which does another jump to the destination. This also makes it easier // to translate it to TBB / TBH later (Thumb2 only). // FIXME: This might not work if the function is extremely large. 
return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, Addr, Op.getOperand(2), JTI); } if (isPositionIndependent() || Subtarget->isROPI()) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getValueType().getVectorElementType() == MVT::i32) { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } const bool HasFullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); EVT NewTy; const EVT OpTy = Op.getOperand(0).getValueType(); if (OpTy == MVT::v4f32) NewTy = MVT::v4i32; else if (OpTy == MVT::v4f16 && HasFullFP16) NewTy = MVT::v4i16; else if (OpTy == MVT::v8f16 && HasFullFP16) NewTy = MVT::v8i16; else llvm_unreachable("Invalid type for custom lowering!"); if (VT != MVT::v4i16 && VT != MVT::v8i16) return DAG.UnrollVectorOp(Op.getNode()); Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); } SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { if (VT.getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } assert((Op.getOperand(0).getValueType() == MVT::v4i16 || Op.getOperand(0).getValueType() == MVT::v8i16) && "Invalid type for custom lowering!"); const bool HasFullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); EVT DestVecType; if (VT == MVT::v4f32) DestVecType = MVT::v4i32; else if (VT == MVT::v4f16 && HasFullFP16) DestVecType = MVT::v4i16; else if (VT == MVT::v8f16 && HasFullFP16) DestVecType = MVT::v8i16; else return DAG.UnrollVectorOp(Op.getNode()); unsigned CastOpc; unsigned Opc; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid opcode!"); case ISD::SINT_TO_FP: CastOpc = ISD::SIGN_EXTEND; Opc = ISD::SINT_TO_FP; break; case ISD::UINT_TO_FP: CastOpc = ISD::ZERO_EXTEND; Opc = ISD::UINT_TO_FP; break; } Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); if (isUnsupportedFloatingType(VT)) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 
Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR; bool UseNEON = !InGPR && Subtarget->hasNEON(); if (UseNEON) { // Use VBSL to copy the sign bit. unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getConstant(0, dl, MVT::i32)); } else { Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); } return Res; } // Bitcast operand 1 to i32. if (SrcVT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Tmp1).getValue(1); Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); // Or in the signbit with integer operations. SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); if (VT == MVT::f32) { Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); } // f64: Or the high part with signbit and then combine two parts. 
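  // Illustration (not from the original source):
  //   copysign(x, y) = VMOVDRR(lo(x),
  //                            (hi(x) & 0x7fffffff) | (bits(y) & 0x80000000))
  // where hi(x) carries the f64 sign bit.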
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}

SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit
  // live-in.
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}

SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
      *static_cast<const ARMBaseRegisterInfo *>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op); // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  Register FrameReg = ARI.getFrameRegister(MF);
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned ARMTargetLowering::getRegisterByName(const char *RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("sp", ARM::SP)
                     .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error(
      Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
}

// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64 &&
         "ExpandREAD_REGISTER called for non-i64 type result.");

  SDValue Read =
      DAG.getNode(ISD::READ_REGISTER, DL,
                  DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                  N->getOperand(0), N->getOperand(1));

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
                                Read.getValue(0), Read.getValue(1)));
  Results.push_back(Read.getOperand(0));
}

/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \p return The node that would replace \p BT, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
// Moreover, we can do this combine only if there is one use. // Finally, if the destination type is not a vector, there is not // much point on forcing everything on the vector bank. if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !Op.hasOneUse()) return SDValue(); // If the index is not constant, we will introduce an additional // multiply that will stick. // Give up in that case. ConstantSDNode *Index = dyn_cast(Op.getOperand(1)); if (!Index) return SDValue(); unsigned DstNumElt = DstVT.getVectorNumElements(); // Compute the new index. const APInt &APIntIndex = Index->getAPIntValue(); APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); NewIndex *= APIntIndex; // Check if the new constant index fits into i32. if (NewIndex.getBitWidth() > 32) return SDValue(); // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) SDLoc dl(Op); SDValue ExtractSrc = Op.getOperand(0); EVT VecVT = EVT::getVectorVT( *DAG.getContext(), DstVT.getScalarType(), ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); } /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); // This function is only supposed to be called for i64 types, either as the // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); const bool HasFullFP16 = Subtarget->hasFullFP16(); if (SrcVT == MVT::f32 && DstVT == MVT::i32) { // FullFP16: half values are passed in S-registers, and we don't // need any of the bitcast and moves: // // t2: f32,ch = CopyFromReg t0, Register:f32 %0 // t5: i32 = bitcast t2 // t18: f16 = ARMISD::VMOVhr t5 if (Op.getOpcode() != ISD::CopyFromReg || Op.getValueType() != MVT::f32) return SDValue(); auto Move = N->use_begin(); if (Move->getOpcode() != ARMISD::VMOVhr) return SDValue(); SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); DAG.ReplaceAllUsesWith(*Move, &Copy); return Copy; } if (SrcVT == MVT::i16 && DstVT == MVT::f16) { if (!HasFullFP16) return SDValue(); // SoftFP: read half-precision arguments: // // t2: i32,ch = ... 
// t7: i16 = truncate t2 <~~~~ Op // t8: f16 = bitcast t7 <~~~~ N // if (Op.getOperand(0).getValueType() == MVT::i32) return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), MVT::f16, Op.getOperand(0)); return SDValue(); } // Half-precision return values if (SrcVT == MVT::f16 && DstVT == MVT::i16) { if (!HasFullFP16) return SDValue(); // // t11: f16 = fadd t8, t10 // t12: i16 = bitcast t11 <~~~ SDNode N // t13: i32 = zero_extend t12 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 // // transform this into: // // t20: i32 = ARMISD::VMOVrh t11 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 // auto ZeroExtend = N->use_begin(); if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || ZeroExtend->getValueType(0) != MVT::i32) return SDValue(); auto Copy = ZeroExtend->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg && Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); return Cvt; } return SDValue(); } if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { // Do not force values to GPRs (this is what VMOVDRR does for the inputs) // if we can combine the bitcast with its source. if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) return Val; SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, DstVT, DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); else Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Op); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } return SDValue(); } /// getZeroVector - Returns a vector of specified type with all zero elements. /// Zero vectors are used to represent vector negation and in those cases /// will be implemented with the NEON VNEG instruction. However, VNEG does /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. 
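/// For a shift amount Amt < 32 this computes (illustration, not from the
/// original source):
///   Lo = (ShOpLo >> Amt) | (ShOpHi << (32 - Amt))
///   Hi = ShOpHi >>(s|u) Amt
/// while for Amt >= 32 the low word instead takes ShOpHi >> (Amt - 32).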
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CCR, CmpLo); SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue HiBigShift = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, VT)) : DAG.getConstant(0, dl, VT); SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CCR, CmpHi); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CCR, CmpHi); SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. 
// The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue Ops[] = { DAG.getEntryNode(), DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); if (VT.isVector()) { assert(ST->hasNEON()); // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); EVT ElemTy = VT.getVectorElementType(); if (ElemTy == MVT::i8) { // Compute with: cttz(x) = ctpop(lsb - 1) SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 unsigned NumBits = ElemTy.getSizeInBits(); SDValue WidthMinus1 = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); } // Compute with: cttz(x) = ctpop(lsb - 1) // Compute LSB - 1. SDValue Bits; if (ElemTy == MVT::i64) { // Load constant 0xffff'ffff'ffff'ffff to register. SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0x1eff, dl, MVT::i32)); Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); } else { SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if (!ST->hasV6T2Ops()) return SDValue(); SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc DL(N); assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. unsigned EltSize = 8; unsigned NumElts = VT.is64BitVector() ? 
8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);
    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }
  return Res;
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the shift count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}

static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there are also shift by a gpr, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
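  // Illustrative example (added, not from the original comments): for a
  // v4i32 unsigned shift `x >> y` where `y` is not a constant splat, the
  // code below emits VSHLu(x, sub(0, y)); each lane of x is then shifted by
  // a negative per-lane amount, which the instruction treats as a right
  // shift by |y|.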
if (N->getOpcode() == ISD::SHL) { if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), N->getOperand(1)); } assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } // Other right shifts we don't have operations for (we use a shift left by a // negative number). EVT ShiftVT = N->getOperand(1).getValueType(); SDValue NegatedCount = DAG.getNode( ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) return SDValue(); assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SHL) && "Unknown shift to lower!"); unsigned ShOpc = N->getOpcode(); if (ST->hasMVEIntegerOps()) { SDValue ShAmt = N->getOperand(1); unsigned ShPartsOpc = ARMISD::LSLL; ConstantSDNode *Con = dyn_cast(ShAmt); // If the shift amount is greater than 32 then do the default optimisation if (Con && Con->getZExtValue() > 32) return SDValue(); // Extract the lower 32 bits of the shift amount if it's an i64 if (ShAmt->getValueType(0) == MVT::i64) ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, DAG.getConstant(0, dl, MVT::i32)); if (ShOpc == ISD::SRL) { if (!Con) // There is no t2LSRLr instruction so negate and perform an lsll if the // shift amount is in a register, emulating a right shift. ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(0, dl, MVT::i32), ShAmt); else // Else generate an lsrl on the immediate shift amount ShPartsOpc = ARMISD::LSRL; } else if (ShOpc == ISD::SRA) ShPartsOpc = ARMISD::ASRL; // Lower 32 bits of the destination/source SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, dl, MVT::i32)); // Upper 32 bits of the destination/source SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(1, dl, MVT::i32)); // Generate the shift operation as computed above Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, ShAmt); // The upper 32 bits come from the second return value of lsll Hi = SDValue(Lo.getNode(), 1); return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } // We only lower SRA, SRL of 1 here, all others use generic lowering. if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) return SDValue(); // If we are in thumb mode, we don't have RRX. if (ST->isThumb1Only()) return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(1, dl, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. 
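  // A worked example of the expansion below (a sketch, not part of the
  // original comments): for `(i64 x) >> 1` with x split into GPRs {Lo, Hi},
  //   Hi' = SRL_FLAG(Hi)    ; Hi >> 1, the shifted-out bit 0 lands in carry
  //   Lo' = RRX(Lo)         ; Lo >> 1, old Hi bit 0 rotates into Lo bit 31
  // so the pair {Lo', Hi'} is exactly x >> 1 computed in two instructions.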
unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); // The low part is an ARMISD::RRX operand, which shifts the carry in. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { bool Invert = false; bool Swap = false; unsigned Opc = ARMCC::AL; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); EVT CmpVT; if (ST->hasNEON()) CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); else { assert(ST->hasMVEIntegerOps() && "No hardware support for integer vector comparison!"); if (Op.getValueType().getVectorElementType() != MVT::i1) return SDValue(); // Make sure we expand floating point setcc to scalar if we do not have // mve.fp, so that we can handle them from there. if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) return SDValue(); CmpVT = VT; } if (Op0.getValueType().getVectorElementType() == MVT::i64 && (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { // Special-case integer 64-bit equality comparisons. They aren't legal, // but they can be lowered with a few vector instructions. unsigned CmpElements = CmpVT.getVectorNumElements() * 2; EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, DAG.getCondCode(ISD::SETEQ)); SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); if (SetCCOpcode == ISD::SETNE) Merged = DAG.getNOT(dl, Merged, CmpVT); Merged = DAG.getSExtOrTrunc(Merged, dl, VT); return Merged; } if (CmpVT.getVectorElementType() == MVT::i64) // 64-bit comparisons are not legal in general. return SDValue(); if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: case ISD::SETNE: if (ST->hasMVEFloatOps()) { Opc = ARMCC::NE; break; } else { Invert = true; LLVM_FALLTHROUGH; } case ISD::SETOEQ: case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; case ISD::SETONE: { // Expand this to (OLT | OGT). 
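      // Added note (not in the original source): ONE ("ordered and not
      // equal") holds iff one of OLT/OGT holds, and both are false when
      // either operand is NaN, so ONE == OLT | OGT. SETUEQ reuses this
      // expansion with the result inverted, since UEQ == !(OLT | OGT).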
SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, DAG.getConstant(ARMCC::GT, dl, MVT::i32)); SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, DAG.getConstant(ARMCC::GT, dl, MVT::i32)); SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: { // Expand this to (OLT | OGE). SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, DAG.getConstant(ARMCC::GT, dl, MVT::i32)); SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, DAG.getConstant(ARMCC::GE, dl, MVT::i32)); SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } } } else { // Integer comparisons. switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); case ISD::SETNE: if (ST->hasMVEIntegerOps()) { Opc = ARMCC::NE; break; } else { Invert = true; LLVM_FALLTHROUGH; } case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMCC::HI; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMCC::HS; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). if (ST->hasNEON() && Opc == ARMCC::EQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) AndOp = Op1; // Ignore bitconvert. if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); if (!Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } } } if (Swap) std::swap(Op0, Op1); // If one of the operands is a constant vector zero, attempt to fold the // comparison to a specialized compare-against-zero form. SDValue SingleOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { if (Opc == ARMCC::GE) Opc = ARMCC::LE; else if (Opc == ARMCC::GT) Opc = ARMCC::LT; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, DAG.getConstant(Opc, dl, MVT::i32)); } else { Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, DAG.getConstant(Opc, dl, MVT::i32)); } Result = DAG.getSExtOrTrunc(Result, dl, VT); if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we // have to invert the carry first. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); // This converts the boolean value carry into the carry flag. 
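  // Added note (not in the original source): ISD::SUBCARRY uses Borrow == 1
  // to mean "subtract one more", while the ARM SBC instruction consumes
  // Carry == 1 to mean "do not subtract one more" (it computes
  // LHS - RHS - !Carry). Computing Carry = 1 - Borrow above translates one
  // convention into the other before the flag is materialised.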
Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); SDValue FVal = DAG.getConstant(0, DL, MVT::i32); SDValue TVal = DAG.getConstant(1, DL, MVT::i32); SDValue ARMcc = DAG.getConstant( IntCCToARMCC(cast(Cond)->get()), DL, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, Cmp.getValue(1), SDValue()); return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, CCR, Chain.getValue(1)); } /// isVMOVModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON or MVE instruction with a "modified /// immediate" operand (e.g., VMOV). If so, return the encoded value. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, VMOVModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified // immediate instructions others than VMOV do not support the 8-bit encoding // of a zero vector, and the default encoding of zero is supposed to be the // 32-bit version. if (SplatBits == 0) SplatBitSize = 32; switch (SplatBitSize) { case 8: if (type != VMOVModImm) return SDValue(); // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); OpCmode = 0xe; Imm = SplatBits; VT = is128Bits ? MVT::v16i8 : MVT::v8i8; break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. VT = is128Bits ? MVT::v8i16 : MVT::v4i16; if ((SplatBits & ~0xff) == 0) { // Value = 0x00nn: Op=x, Cmode=100x. OpCmode = 0x8; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0xnn00: Op=x, Cmode=101x. OpCmode = 0xa; Imm = SplatBits >> 8; break; } return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. VT = is128Bits ? MVT::v4i32 : MVT::v2i32; if ((SplatBits & ~0xff) == 0) { // Value = 0x000000nn: Op=x, Cmode=000x. OpCmode = 0; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0x0000nn00: Op=x, Cmode=001x. OpCmode = 0x2; Imm = SplatBits >> 8; break; } if ((SplatBits & ~0xff0000) == 0) { // Value = 0x00nn0000: Op=x, Cmode=010x. OpCmode = 0x4; Imm = SplatBits >> 16; break; } if ((SplatBits & ~0xff000000) == 0) { // Value = 0xnn000000: Op=x, Cmode=011x. OpCmode = 0x6; Imm = SplatBits >> 24; break; } // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC if (type == OtherModImm) return SDValue(); if ((SplatBits & ~0xffff) == 0 && ((SplatBits | SplatUndef) & 0xff) == 0xff) { // Value = 0x0000nnff: Op=x, Cmode=1100. OpCmode = 0xc; Imm = SplatBits >> 8; break; } // cmode == 0b1101 is not supported for MVE VMVN if (type == MVEVMVNModImm) return SDValue(); if ((SplatBits & ~0xffffff) == 0 && ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { // Value = 0x00nnffff: Op=x, Cmode=1101. OpCmode = 0xd; Imm = SplatBits >> 16; break; } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. 
A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. return SDValue(); case 64: { if (type != VMOVModImm) return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; uint64_t Val = 0; unsigned ImmMask = 1; Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; Imm |= ImmMask; } else if ((SplatBits & BitMask) != 0) { return SDValue(); } BitMask <<= 8; ImmMask <<= 1; } if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); // Op=1, Cmode=1110. OpCmode = 0x1e; VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } default: llvm_unreachable("unexpected size for isVMOVModifiedImm"); } unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { EVT VT = Op.getValueType(); bool IsDouble = (VT == MVT::f64); ConstantFPSDNode *CFP = cast(Op); const APFloat &FPVal = CFP->getValueAPF(); // Prevent floating-point constants from using literal loads // when execute-only is enabled. if (ST->genExecuteOnly()) { // If we can represent the constant as an immediate, don't lower it if (isFPImmLegal(FPVal, VT)) return Op; // Otherwise, construct as integer, and move to float register APInt INTVal = FPVal.bitcastToAPInt(); SDLoc DL(CFP); switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unknown floating point type!"); break; case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); if (!ST->isLittle()) std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: return DAG.getNode(ARMISD::VMOVSR, DL, VT, DAG.getConstant(INTVal, DL, MVT::i32)); } } if (!ST->hasVFP3Base()) return SDValue(); // Use the default (constant pool) lowering for double constants when we have // an SP-only FPU if (IsDouble && !Subtarget->hasFP64()) return SDValue(); // Try splatting with a VMOV.f32... int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); if (ImmVal != -1) { if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { // We have code in place to select a valid ConstantFP already, no need to // do any mangling. return Op; } // It's a float and we are trying to use NEON operations where // possible. Lower it to a splat followed by an extract. SDLoc DL(Op); SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, DAG.getConstant(0, DL, MVT::i32)); } // The rest of our options are NEON only, make sure that's allowed before // proceeding.. if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) return SDValue(); EVT VMovVT; uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); // It wouldn't really be worth bothering for doubles except for one very // important value, which does happen to match: 0.0. So make sure we don't do // anything stupid. if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 
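  // Illustrative case (added, not exhaustive): 1.0f is 0x3f800000, which has
  // two nonzero bytes and therefore no VMOV.i32 modified-immediate encoding,
  // but it is caught earlier by the VFP immediate check. A splatted bit
  // pattern such as 0x00ff0000, with a single nonzero byte, is the kind of
  // value the VMOV.i32 attempt below can materialise directly.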
  SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32
  NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  return SDValue();
}

// Check if a VEXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are the same.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();
  ReverseVEXT = false;

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, it may still be
    // a VEXT but the source vectors must be swapped.
    ExpectedElt += 1;
    if (ExpectedElt == NumElts * 2) {
      ExpectedElt = 0;
      ReverseVEXT = true;
    }

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  // Adjust the index value if the source operands will be swapped.
  if (ReverseVEXT)
    Imm -= NumElts;

  return true;
}

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize.  (The order of the elements
/// within each block of the vector is reversed.)
static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
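  // Worked example (added): for VT = v8i16 and BlockSize = 32, each 32-bit
  // block holds two i16 elements, so BlockElts = 2 and the only accepted
  // mask is <1, 0, 3, 2, 5, 4, 7, 6> (each pair reversed in place).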
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  // range, then 0 is placed into the resulting vector. So pretty much any mask
  // of 8 elements can work here.
  return VT == MVT::v8i8 && M.size() == 8;
}

static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                               unsigned Index) {
  if (Mask.size() == Elements * 2)
    return Index / Elements;
  return Mask[Index] == 0 ? 0 : 1;
}

// Checks whether the shuffle mask represents a vector transpose (VTRN) by
// checking that pairs of elements in the shuffle mask represent the same index
// in each vector, incrementing the expected index by 2 at each step.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
//  v2={e,f,g,h}
// WhichResult gives the offset for each element in the mask based on which
// of the two results it belongs to.
//
// The transpose can be represented either as:
// result1 = shufflevector v1, v2, result1_shuffle_mask
// result2 = shufflevector v1, v2, result2_shuffle_mask
// where v1/v2 and the shuffle masks have the same number of elements
// (here WhichResult (see below) indicates which result is being checked)
//
// or as:
// results = shufflevector v1, v2, shuffle_mask
// where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (here WhichResult will always be 0 if true); in
// this case we check the low half and the high half of the shuffle mask as if
// they were the separate masks of the two-result form.
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  // If the mask is twice as long as the input vector then we need to check the
  // upper and lower parts of the mask with a matching value for WhichResult
  // FIXME: A mask with only even values will be rejected in case the first
  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  // M[0] is used to determine WhichResult
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned)M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned)M[i+j+1] != j + NumElts + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  return true;
}

/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT,
                                unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned)M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned)M[i+j+1] != j + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  return true;
}

// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
// that the mask elements are either all even and in steps of size 2 or all odd
// and in steps of size 2.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
//  v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with respect to how the
// results are returned.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; ++j) {
      if (M[i+j] >= 0 && (unsigned)M[i+j] != 2 * j + WhichResult)
        return false;
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT,
                                unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  unsigned Half = NumElts / 2;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += Half) {
      unsigned Idx = WhichResult;
      for (unsigned k = 0; k < Half; ++k) {
        int MIdx = M[i + j + k];
        if (MIdx >= 0 && (unsigned)MIdx != Idx)
          return false;
        Idx += 2;
      }
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
// that pairs of elements of the shufflemask represent the same index in each
// vector incrementing sequentially through the vectors.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
//  v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with respect to how the
// results are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned)M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned)M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT,
                                unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned)M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned)M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts * 2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on a vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int)(NumElts - 1 - i))
      return false;

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction).  Otherwise return null.
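// Added note (illustrative): on Thumb1 both 200 (Val <= 255) and 0xFFFFFF38
// (~Val == 199 <= 255) pass the check below, while in ARM mode the test is
// whether the value or its bitwise NOT is a rotated 8-bit immediate, e.g.
// 0x00AB0000 (0xAB rotated into place) is accepted but 0x12345678 is not.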
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BoolMask;
  unsigned BitsPerBool;
  if (NumElts == 4) {
    BitsPerBool = 4;
    BoolMask = 0xf;
  } else if (NumElts == 8) {
    BitsPerBool = 2;
    BoolMask = 0x3;
  } else if (NumElts == 16) {
    BitsPerBool = 1;
    BoolMask = 0x1;
  } else
    return SDValue();

  // First create base with bits set where known
  unsigned Bits32 = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (!isa<ConstantSDNode>(V) && !V.isUndef())
      continue;
    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
    if (BitSet)
      Bits32 |= BoolMask << (i * BitsPerBool);
  }

  // Add in unknown nodes
  // FIXME: Handle splats of the same value better.
  SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
                             DAG.getConstant(Bits32, dl, MVT::i32));
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (isa<ConstantSDNode>(V) || V.isUndef())
      continue;
    Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
                       DAG.getConstant(i, dl, MVT::i32));
  }

  return Base;
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerBUILD_VECTOR_i1(Op, DAG, ST);

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if ((ST->hasNEON() && SplatBitSize <= 64) ||
        (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
      // Check if an immediate VMOV works.
      EVT VmovVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                      VMOVModImm);

      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isVMOVModifiedImm(
          NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
          VT.is128BitVector(),
          ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value and then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool hasDominantValue = false;
  bool isConstant = true;

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  if (!Value.getNode() && !ValueCounts.empty())
    Value = ValueCounts.begin()->first;

  if (ValueCounts.empty())
    return DAG.getUNDEF(VT);

  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we are hitting this case.
  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getScalarSizeInBits();

  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE. We can only do this if the lane being extracted
      // is at a constant index, as the VDUP from lane instructions only have
      // constant-index forms.
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create. We will insert the element
        // such that the register coalescer will remove unnecessary copies.
        if (VT != Value->getOperand(0).getValueType()) {
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                             VT.getVectorNumElements();
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT,
                                      DAG.getUNDEF(VT), Value,
                                      DAG.getConstant(index, dl, MVT::i32)),
                          DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT FVT = VT.getVectorElementType().getSimpleVT();
      assert(FVT == MVT::f32 || FVT == MVT::f16);
      MVT IVT = (FVT == MVT::f32) ?
MVT::i32 : MVT::i16; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if (usesOnlyOneValue) { SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); if (isConstant && Val.getNode()) return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } } // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { SDValue shuffle = ReconstructShuffle(Op, DAG); if (shuffle != SDValue()) return shuffle; } if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector // into two 64-bit vectors; we might discover a better way to lower it. SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElts); EVT ExtVT = VT.getVectorElementType(); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); SDValue Lower = DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); if (Lower.getOpcode() == ISD::BUILD_VECTOR) Lower = LowerBUILD_VECTOR(Lower, DAG, ST); SDValue Upper = DAG.getBuildVector( HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); if (Upper.getOpcode() == ISD::BUILD_VECTOR) Upper = LowerBUILD_VECTOR(Upper, DAG, ST); if (Lower && Upper) return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); } // Vectors with 32- or 64-bit elements can be built by directly assigning // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); for (unsigned i = 0 ; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } return SDValue(); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. 
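// Added example (hypothetical IR, for illustration): a BUILD_VECTOR of
// v4i16 whose operands are extractelts of lanes 1..4 of a single v8i16
// source straddles the two 64-bit halves of that source; the routine below
// models it as a VEXT of the low and high halves with immediate MinElt = 1.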
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec
    // to be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this
  // BUILD_VECTOR node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be
  // able to construct a compatible shuffle either by concatenating it with
  // UNDEF or extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type
    // as the original, but with a total width matching the BUILD_VECTOR
    // output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Src.MinElt, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; auto Src = llvm::find(Sources, Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. 
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], Mask);
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}

enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};

static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  switch (OpNum) {
  case OP_COPY:
  case OP_VREV:
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return true;
  }
  return false;
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
        PFIndexes[0]*9*9*9 + PFIndexes[1]*9*9 + PFIndexes[2]*9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
      return true;
  }

  bool ReverseVEXT, isV_UNDEF;
  unsigned Imm, WhichResult;

  unsigned EltSize = VT.getScalarSizeInBits();
  if (EltSize >= 32 ||
      ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
      isVREVMask(M, VT, 64) ||
      isVREVMask(M, VT, 32) ||
      isVREVMask(M, VT, 16))
    return true;
  else if (Subtarget->hasNEON() &&
           (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
            isVTBLMask(M, VT) ||
            isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
    return true;
  else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
           isReverseMask(M, VT))
    return true;
  else
    return false;
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);

  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
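    // Added example: for a 4 x i32 vector, OP_VREV selects VREV64 below,
    // which reverses the lanes within each 64-bit half:
    // <0,1,2,3> -> <1,0,3,2>.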
if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); // vrev <4 x i16> -> VREV32 if (VT.getVectorElementType() == MVT::i16) return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); // vrev <4 x i8> -> VREV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); case OP_VUZPL: case OP_VUZPR: return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); case OP_VZIPL: case OP_VZIPR: return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); case OP_VTRNL: case OP_VTRNR: return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); } } static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the VTBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); SmallVector VTBLMask; for (ArrayRef::iterator I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); } static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue OpLHS = Op.getOperand(0); EVT VT = OpLHS.getValueType(); assert((VT == MVT::v8i16 || VT == MVT::v16i8) && "Expect an v8i16/v16i8 type"); OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, // extract the first 8 bytes into the top double word and the last 8 bytes // into the bottom double word. The v8i16 case is similar. unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, DAG.getConstant(ExtractNum, DL, MVT::i32)); } static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: return MVT::v8i16; case MVT::v16i1: return MVT::v16i8; default: llvm_unreachable("Unexpected vector predicate type"); } } static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG) { // Converting from boolean predicates to integers involves creating a vector // of all ones or all zeroes and selecting the lanes based upon the real // predicate. SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); SDValue AllZeroes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); // Get full vector type from predicate type EVT NewVT = getVectorTyFromPredicateVector(VT); SDValue RecastV1; // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast // this to a v16i1. This cannot be done with an ordinary bitcast because the // sizes are not the same. 
We have to use an MVE-specific PREDICATE_CAST node, // since we know in hardware the sizes are really the same. if (VT != MVT::v16i1) RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); else RecastV1 = Pred; // Select either all ones or zeroes depending upon the real predicate bits. SDValue PredAsVector = DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); // Recast our new predicate-as-integer v16i8 vector into something // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); } static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); ArrayRef<int> ShuffleMask = SVN->getMask(); assert(ST->hasMVEIntegerOps() && "No support for vector shuffle of boolean predicates"); SDValue V1 = Op.getOperand(0); SDLoc dl(Op); if (isReverseMask(ShuffleMask, VT)) { SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, DAG.getConstant(16, dl, MVT::i32)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); } // Until we can come up with optimised cases for every single vector // shuffle in existence we have chosen the least painful strategy. This is // to essentially promote the boolean predicate to an 8-bit integer, where // each predicate represents a byte. Then we fall back on a normal integer // vector shuffle and convert the result back into a predicate vector. In // many cases the generated code might be even better than scalar code // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit // fields in a register into 8 other arbitrary 2-bit fields! SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); EVT NewVT = PredAsVector.getValueType(); // Do the shuffle! SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); unsigned EltSize = VT.getScalarSizeInBits(); if (ST->hasMVEIntegerOps() && EltSize == 1) return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. // FIXME: floating-point vectors should be canonicalized to integer vectors // of the same type so that they get CSEd properly. ArrayRef<int> ShuffleMask = SVN->getMask(); if (EltSize <= 32) { if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is an undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; // Test if V1 is a SCALAR_TO_VECTOR.
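// (Illustrative: a lane-0 splat of a SCALAR_TO_VECTOR, e.g. shuffle(scalar_to_vector(x), undef, <0,0,0,0>), is simply a broadcast of the scalar x, so it can be emitted directly as ARMISD::VDUP of x rather than as a VDUPLANE of an already-built vector.)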
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization // reaches it). if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(0))) { bool IsScalarToVector = true; for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) if (!V1.getOperand(i).isUndef()) { IsScalarToVector = false; break; } if (IsScalarToVector) return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i32)); } bool ReverseVEXT = false; unsigned Imm = 0; if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } if (isVREVMask(ShuffleMask, VT, 64)) return DAG.getNode(ARMISD::VREV64, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 32)) return DAG.getNode(ARMISD::VREV32, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } // Check for Neon shuffles that modify both input vectors in place. // If both results are used, i.e., if there are two shuffles with the same // source operands and with masks corresponding to both results of one of // these operations, DAG memoization will ensure that a single node is // used for both shuffles. unsigned WhichResult = 0; bool isV_UNDEF = false; if (ST->hasNEON()) { if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, VT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) V2 = V1; return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) .getValue(WhichResult); } } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize // shuffles that produce a result larger than their operands with: // shuffle(concat(v1, undef), concat(v2, undef)) // -> // shuffle(concat(v1, v2), undef) // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). // // This is useful in the general case, but there are special cases where // native shuffles produce larger results: the two-result ops. // // Look through the concat when lowering them: // shuffle(concat(v1, v2), undef) // -> // concat(VZIP(v1, v2):0, :1) // if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); // We expect these to have been canonicalized to -1. assert(llvm::all_of(ShuffleMask, [&](int i) { return i < (int)VT.getVectorNumElements(); }) && "Unexpected shuffle index into UNDEF operand!"); if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) SubV2 = SubV1; assert((WhichResult == 0) && "In-place shuffle of concat can only have one result!"); SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), SubV1, SubV2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), Res.getValue(1)); } } } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. 
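// Worked example (illustrative): for the mask <1,0,3,2> the lane indices are encoded as base-9 digits (8 encodes an undef lane), giving PFTableIndex = 1*9*9*9 + 0*9*9 + 3*9 + 2 = 758. In each table entry, bits 31-30 hold the cost, bits 29-26 the opcode (see ShuffleOpCodes above), and the two 13-bit halves identify the left and right operand shuffles.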
unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) { if (ST->hasNEON()) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); else if (isLegalMVEShuffleOp(PFEntry)) { unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } } } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) Ops.push_back(DAG.getUNDEF(EltVT)); else Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShuffleMask[i] < (int)NumElts ? V1 : V2, DAG.getConstant(ShuffleMask[i] & (NumElts-1), dl, MVT::i32))); } SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; return SDValue(); } static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VecVT = Op.getOperand(0).getValueType(); SDLoc dl(Op); assert(ST->hasMVEIntegerOps() && "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); SDValue Conv = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); unsigned LaneWidth = getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, Op.getOperand(1), DAG.getValueType(MVT::i1)); SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, DAG.getConstant(~Mask, dl, MVT::i32)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); } SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa<ConstantSDNode>(Lane)) return SDValue(); SDValue Elt = Op.getOperand(1); EVT EltVT = Elt.getValueType(); if (Subtarget->hasMVEIntegerOps() && Op.getValueType().getScalarSizeInBits() == 1) return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); if (getTypeAction(*DAG.getContext(), EltVT) == TargetLowering::TypePromoteFloat) { // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, // but the type system will try to do that if we don't intervene.
// Reinterpret any such vector-element insertion as one with the // corresponding integer types. SDLoc dl(Op); EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); assert(getTypeAction(*DAG.getContext(), IEltVT) != TargetLowering::TypePromoteFloat); SDValue VecIn = Op.getOperand(0); EVT VecVT = VecIn.getValueType(); EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, VecVT.getVectorNumElements()); SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, IVecIn, IElt, Lane); return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); } return Op; } static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VecVT = Op.getOperand(0).getValueType(); SDLoc dl(Op); assert(ST->hasMVEIntegerOps() && "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!"); SDValue Conv = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); unsigned LaneWidth = getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); return Shift; } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); if (!isa<ConstantSDNode>(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); EVT VT = Vec.getValueType(); if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); } return Op; } static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT Op1VT = V1.getValueType(); EVT Op2VT = V2.getValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(Op1VT == Op2VT && "Operand types don't match!"); assert(VT.getScalarSizeInBits() == 1 && "Unexpected custom CONCAT_VECTORS lowering"); assert(ST->hasMVEIntegerOps() && "CONCAT_VECTORS lowering only supported for MVE"); SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets // promoted to v8i16, etc. MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); // Extract the vector elements from Op1 and Op2 one by one and truncate them // to be the right size for the destination. For example, if Op1 is v4i1 then // the promoted vector is v4i32. The result of concatenation gives a v8i1, // which when promoted is v8i16. That means each i32 element from Op1 needs // truncating to i16 and inserting in the result.
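// Sketch of the flow for two v4i1 operands (illustrative): each side is promoted to v4i32, the eight i32 lanes are extracted one by one and inserted (implicitly truncating to i16) into a v8i16, and a final VCMPZ/NE against zero turns that v8i16 back into the v8i1 result.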
EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { EVT NewVT = NewV.getValueType(); EVT ConcatVT = ConVec.getValueType(); for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, DAG.getIntPtrConstant(i, dl)); ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, DAG.getConstant(j, dl, MVT::i32)); } return ConVec; }; unsigned j = 0; ConVec = ExtractInto(NewV1, ConVec, j); ConVec = ExtractInto(NewV2, ConVec, j); // Now return the result of comparing the concatenated vector with zero, // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = Op->getValueType(0); if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) return LowerCONCAT_VECTORS_i1(Op, DAG, ST); // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && "unexpected CONCAT_VECTORS"); SDLoc dl(Op); SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (!Op0.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0, dl)); if (!Op1.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1, dl)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT Op1VT = V1.getValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); assert(VT.getScalarSizeInBits() == 1 && "Unexpected custom EXTRACT_SUBVECTOR lowering"); assert(ST->hasMVEIntegerOps() && "EXTRACT_SUBVECTOR lowering only supported for MVE"); SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); // We now have Op1 promoted to a vector of integers, where v8i1 gets // promoted to v8i16, etc. MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, DAG.getIntPtrConstant(i, dl)); SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, DAG.getConstant(j, dl, MVT::i32)); } // Now return the result of comparing the subvector with zero, // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
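// Illustrative example: on little-endian, the v2i64 constant <42, -1> arrives here as bitcast(v4i32 <42, 0, -1, -1>). Each 64-bit element is sign-extended from 32 bits iff its high i32 word equals the low word arithmetically shifted right by 32, and zero-extended iff the high word is zero; that is exactly what the checks below test.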
EVT VT = N->getValueType(0); if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); if (!Lo0 || !Hi0 || !Lo1 || !Hi1) return false; if (isSigned) { if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) return true; } else { if (Hi0->isNullValue() && Hi1->isNullValue()) return true; } return false; } if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } /// isSignExtended - Check if a node is a vector value that is sign-extended /// or a constant BUILD_VECTOR with sign-extended elements. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, true)) return true; return false; } /// isZeroExtended - Check if a node is a vector value that is zero-extended /// or a constant BUILD_VECTOR with zero-extended elements. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; return false; } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } /// SkipLoadExtensionForVMULL - return a load of the original vector size that /// does not do any sign/zero extension. If the original vector is less /// than 64 bits, an appropriate extension will be added after the load to /// reach a total size of 64 bits.
We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) { EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getAlignment(), LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/sext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(), LD->getAlignment(), LD->getMemOperand()->getFlags()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, /// extending load, or BUILD_VECTOR with extended elements, return the /// unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && "Expected extending load"); SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue extLoad = DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); return newLoad; } // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will // have been legalized as a BITCAST from v4i32. if (N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getBuildVector( MVT::v2i32, SDLoc(N), {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); } // Construct a new BUILD_VECTOR with elements truncated to half the size. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector<SDValue, 8> Ops; SDLoc dl(N); for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter.
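// (Illustrative: a v4i32 BUILD_VECTOR <100, -7, 3, 0> whose elements all fit in i16 is rebuilt as a v4i16 BUILD_VECTOR carrying the same i32 operands; BUILD_VECTOR tolerates operands wider than the element type and truncates them implicitly.)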
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = ARMISD::VMULLs; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = ARMISD::VMULLu; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = ARMISD::VMULLs; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = ARMISD::VMULLu; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = ARMISD::VMULLu; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a VMULL instruction. SDLoc DL(Op); SDValue Op0; SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { Op0 = SkipExtensionForVMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during // isel lowering to take advantage of no-stall back to back vmul + vmla. // vmull q0, d4, d6 // vmlal q0, d5, d6 // is faster than // vaddl q0, d4, d5 // vmovl q1, d6 // vmul q0, q0, q1 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? 
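// Algorithm sketch (illustrative): NEON has no integer divide instruction, so the 8-bit lanes (already sign-extended into v4i16 halves by the caller) are widened to v4i32, converted to v4f32, and x/y is approximated as x * vrecpe(y) plus a small fixed bias; the narrow i8 input range is what lets this skip the Newton-Raphson refinement steps entirely.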
// Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); // Get reciprocal estimate. // float4 recip = vrecpeq_f32(yf); Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), Y); // Because char has a smaller range than uchar, we can actually get away // without any newton steps. This requires that we use a weird bias // of 0xb000, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0xb000); X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); Y = DAG.getConstant(0xb000, dl, MVT::v4i32); X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); // Convert back to short. X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); return X; } static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); // float4 xf = vcvt_f32_s32(vmovl_s16(x)); N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and one refinement step. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), N1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Because short has a smaller range than ushort, we can actually get away // with only a single newton step. This requires that we use a weird bias // of 89, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(0x89, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. 
// return vmovn_s32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; } return LowerSDIV_v4i16(N0, N1, dl, DAG); } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, MVT::i32), N0); return N0; } // v4i16 sdiv ... Convert to float. // float4 yf = vcvt_f32_s32(vmovl_u16(y)); // float4 xf = vcvt_f32_s32(vmovl_u16(x)); N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and two refinement steps. 
// float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), BN1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Simply multiplying by the reciprocal estimate can leave us a few ulps // too low, so we add 2 ulps (exhaustive testing shows that this is enough, // and that it will never cause us to return an answer too large). // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(2, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. // return vmovn_u32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); EVT VT = N->getValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDValue Carry = Op.getOperand(2); SDLoc DL(Op); SDValue Result; if (Op.getOpcode() == ISD::ADDCARRY) { // This converts the boolean value carry into the carry flag. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); // Do the addition proper using the carry flag we wanted. Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry); // Now convert the carry flag into a boolean value. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); } else { // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we // have to invert the carry first. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); // This converts the boolean value carry into the carry flag. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); // Do the subtraction proper using the carry flag we wanted. Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry); // Now convert the carry flag into a boolean value. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); // But the carry returned by ARMISD::SUBE is not a borrow as expected // by ISD::SUBCARRY, so compute 1 - C. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); } // Return both values. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. 
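// Conceptually the callee behaves like the prototype below (illustrative only, not a declaration taken from any SDK header): // struct sincos_ret { float sin, cos; }; // struct sincos_ret __sincos_stret(float); // with the pair returned in registers, or written through an sret pointer under APCS, which is what the ShouldUseSRet path below sets up.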
Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; bool ShouldUseSRet = Subtarget->isAPCS_ABI(); SDValue SRet; if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); ArgListEntry Entry; Entry.Node = SRet; Entry.Ty = RetTy->getPointerTo(); Entry.IsSExt = false; Entry.IsZExt = false; Entry.IsSRet = true; Args.push_back(Entry); RetTy = Type::getVoidTy(*DAG.getContext()); } ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); RTLIB::Libcall LC = (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = getLibcallName(LC); CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setCallee(CC, RetTy, Callee, std::move(Args)) .setDiscardResult(ShouldUseSRet); std::pair CallResult = LowerCallTo(CLI); if (!ShouldUseSRet) return CallResult.first; SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); // Address of cos field. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { EVT VT = Op.getValueType(); assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected type for custom lowering DIV"); SDLoc dl(Op); const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); const char *Name = nullptr; if (Signed) Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; else Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); ARMTargetLowering::ArgListTy Args; for (auto AI : {1, 0}) { ArgListEntry Arg; Arg.Node = Op.getOperand(AI); Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); Args.push_back(Arg); } CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args)); return LowerCallTo(CLI).first; } // This is a code size optimisation: return the original SDIV node to // DAGCombiner when we don't want to expand SDIV into a sequence of // instructions, and an empty node otherwise which will cause the // SDIV to be expanded in DAGCombine. SDValue ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { // TODO: Support SREM if (N->getOpcode() != ISD::SDIV) return SDValue(); const auto &ST = static_cast(DAG.getSubtarget()); const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); // Don't touch vector types; rewriting this may lead to scalarizing // the int divs. 
if (N->getOperand(0).getValueType().isVector()) return SDValue(); // Bail if MinSize is not set, and also for both ARM and Thumb mode we need // hwdiv support for this to be really profitable. if (!(MinSize && HasDivide)) return SDValue(); // ARM mode is a bit simpler than Thumb: we can handle large power // of 2 immediates with 1 mov instruction; no further checks required, // just return the sdiv node. if (!ST.isThumb()) return SDValue(N, 0); // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, // and thus lose the code size benefits of a MOVS that requires only 2. // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, // but as it's doing exactly this, it's not worth the trouble to get TTI. if (Divisor.sgt(128)) return SDValue(); return SDValue(N, 0); } SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const { assert(Op.getValueType() == MVT::i32 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Op.getOperand(1)); return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); } static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { SDLoc DL(N); SDValue Op = N->getOperand(1); if (N->getValueType(0) == MVT::i32) return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); } void ARMTargetLowering::ExpandDIV_Windows( SDValue Op, SelectionDAG &DAG, bool Signed, SmallVectorImpl &Results) const { const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); assert(Op.getValueType() == MVT::i64 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); Results.push_back(Lower); Results.push_back(Upper); } static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or // equivalent available. return SDValue(); // Monotonic load/store is legal for all targets. 
return Op; } static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); // Under Power Management extensions, the cycle-count is: // mrc p15, #0, <Rt>, c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), DAG.getConstant(15, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(9, DL, MVT::i32), DAG.getConstant(13, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, DAG.getConstant(0, DL, MVT::i32))); Results.push_back(Cycles32.getValue(1)); } static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), dl, MVT::i32); bool isBigEndian = DAG.getDataLayout().isBigEndian(); if (isBigEndian) std::swap(VLo, VHi); SDValue RegClass = DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; return SDValue( DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); } static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { assert(N->getValueType(0) == MVT::i64 && "AtomicCmpSwap on types less than 64 should be legal"); SDValue Ops[] = {N->getOperand(1), createGPRPairNode(DAG, N->getOperand(2)), createGPRPairNode(DAG, N->getOperand(3)), N->getOperand(0)}; SDNode *CmpSwap = DAG.getMachineNode( ARM::CMP_SWAP_64, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); bool isBigEndian = DAG.getDataLayout().isBigEndian(); Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, SelectionDAG &DAG) { const auto &TLI = DAG.getTargetLoweringInfo(); assert(Subtarget.getTargetTriple().isOSMSVCRT() && "Custom lowering is MSVCRT specific!"); SDLoc dl(Op); SDValue Val = Op.getOperand(0); MVT Ty = Val->getSimpleValueType(0); SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow", TLI.getPointerTy(DAG.getDataLayout())); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Val; Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); Entry.IsZExt = true; Args.push_back(Entry); Entry.Node = Exponent; Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); Entry.IsZExt = true; Args.push_back(Entry); Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); // The in-chain to the call is the entry node. If we are emitting a // tailcall, the chain will be mutated if the node has a non-entry input // chain.
SDValue InChain = DAG.getEntryNode(); SDValue TCChain = InChain; const Function &F = DAG.getMachineFunction().getFunction(); bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && F.getReturnType() == LCRTy; if (IsTC) InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) .setTailCall(IsTC); std::pair CI = TLI.LowerCallTo(CLI); // Return the chain (the DAG root) if it is a tail call return !CI.second.getNode() ? DAG.getRoot() : CI.first; } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); - case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); case ISD::SREM: return LowerREM(Op.getNode(), DAG); case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && 
!Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); return LowerSDIV(Op, DAG, Subtarget); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); return LowerUDIV(Op, DAG, Subtarget); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: case ISD::SSUBO: return LowerSignedALUO(Op, DAG); case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: if (Subtarget->isTargetWindows()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); unsigned Opc = 0; if (IntNo == Intrinsic::arm_smlald) Opc = ARMISD::SMLALD; else if (IntNo == Intrinsic::arm_smlaldx) Opc = ARMISD::SMLALDX; else if (IntNo == Intrinsic::arm_smlsld) Opc = ARMISD::SMLSLD; else if (IntNo == Intrinsic::arm_smlsldx) Opc = ARMISD::SMLSLDX; else return; SDLoc dl(N); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), DAG.getConstant(1, dl, MVT::i32)); SDValue LongMul = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); Results.push_back(LongMul.getValue(0)); Results.push_back(LongMul.getValue(1)); } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); case ISD::READ_REGISTER: ExpandREAD_REGISTER(N, Results, DAG); break; case ISD::BITCAST: Res = ExpandBITCAST(N, DAG, Subtarget); break; case ISD::SRL: case ISD::SRA: case ISD::SHL: Res = Expand64BitShift(N, DAG, Subtarget); break; case ISD::SREM: case ISD::UREM: Res = LowerREM(N, DAG); break; case ISD::SDIVREM: case ISD::UDIVREM: Res = LowerDivRem(SDValue(N, 0), DAG); assert(Res.getNumOperands() == 2 && "DivRem needs two values"); Results.push_back(Res.getValue(0)); Results.push_back(Res.getValue(1)); return; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; case ISD::UDIV: case ISD::SDIV: assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, Results); case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_64Results(N, Results, DAG); return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); case ISD::ABS: lowerABS(N, Results, DAG); return ; } if (Res.getNode()) Results.push_back(Res); } //===----------------------------------------------------------------------===// // ARM Scheduler Hooks //===----------------------------------------------------------------------===// /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported with SjLj"); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineConstantPool *MCP = MF->getConstantPool(); ARMFunctionInfo *AFI = MF->getInfo(); const Function &F = MF->getFunction(); bool isThumb = Subtarget->isThumb(); bool isThumb2 = Subtarget->isThumb2(); unsigned PCLabelId = AFI->createPICLabelUId(); unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { // Incoming value: jbuf // ldr.n r5, LCPI1_1 // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. 
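// (Illustrative: on Thumb targets the low bit of a code address selects the instruction set, so the dispatch-block address stored into the jump buffer must have bit 0 set for the eventual indirect branch to resume in Thumb state.)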
Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01) .add(predOps(ARMCC::AL)) .add(condCodeOp()); Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) .addReg(NewVReg3, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } else if (isThumb) { // Incoming value: jbuf // ldr.n r1, LCPI1_4 // add r1, pc // mov r2, #1 // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1) .add(predOps(ARMCC::AL)); Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill) .add(predOps(ARMCC::AL)); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg5, RegState::Kill) .addImm(0) .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } else { // Incoming value: jbuf // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId) .add(predOps(ARMCC::AL)); BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) .addReg(NewVReg2, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } } void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineFrameInfo &MFI = MF->getFrameInfo(); int FI = MFI.getFunctionContextIndex(); const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass : &ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad.
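// Illustrative example: if call sites 2 and 5 both unwind to the same landing pad, the loop below records that block under both keys in CallSiteNumToLPad, and MaxCSNum (here 5) bounds the size of the jump table built from this mapping.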
    for (MachineBasicBlock::iterator II = BB->begin(), IE = BB->end();
         II != IE; ++II) {
      if (!II->isEHLabel())
        continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MF->hasCallSiteLandingPad(Sym))
        continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator CSI = CallSiteIdxs.begin(),
                                               CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
      break;
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock *> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock *>::iterator II = MBBList.begin(),
                                                        IE = MBBList.end();
         II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo *>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers.  This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
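  // After a longjmp, control can re-enter the dispatch from an arbitrary
  // point in the program, so no register contents can be trusted across it;
  // clobbering everything forces values that are live across an invoke into
  // stack slots instead of registers.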
MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) .addReg(NewVReg1) .addImm(LPadList.size()) .add(predOps(ARMCC::AL)); } else { Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) .addReg(NewVReg1) .addReg(VReg2) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) .addReg(NewVReg1) .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. 
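      // tCMPi8 only encodes an 8-bit immediate and Thumb1 has no movw/movt,
      // so for 256 or more landing pads the count has to be materialized
      // through a constant-pool load before the bounds check.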
unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) .add(predOps(ARMCC::AL)); BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) .addReg(NewVReg1) .addReg(VReg1) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2) .add(predOps(ARMCC::AL)); Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) .addMemOperand(JTMMOLd) .add(predOps(ARMCC::AL)); unsigned NewVReg6 = NewVReg5; if (IsPositionIndependent) { NewVReg6 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg3) .add(predOps(ARMCC::AL)); } BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) .addReg(NewVReg1) .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) .addReg(NewVReg1) .addReg(VReg2) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. 
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg1, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg4)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    if (IsPositionIndependent) {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg4)
          .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
          .addReg(NewVReg5, RegState::Kill)
          .addJumpTableIndex(MJTI);
    }
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 64> SeenMBBs;
  for (std::vector<MachineBasicBlock *>::iterator I = LPadList.begin(),
                                                  E = LPadList.end();
       I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock *, 4> Successors(BB->succ_begin(),
                                                   BB->succ_end());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
         II != IE; ++II) {
      if (!II->isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator OI = II->operands_begin(),
                                      OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg())
          continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);
      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        if (Subtarget->isThumb2() && !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads.
  // The dispatch is the only landing pad now.
  for (SmallVectorImpl<MachineBasicBlock *>::iterator I = MBBLPads.begin(),
                                                      E = MBBLPads.end();
       I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB,
                                    MachineBasicBlock *Succ) {
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
                                        E = MBB->succ_end();
       I != E; ++I)
    if (*I != Succ)
      return *I;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // load + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}

/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); assert(StOpc != 0 && "Should have a store opcode"); if (StSize >= 8) { BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(AddrIn) .addImm(0) .addReg(Data) .add(predOps(ARMCC::AL)); } else if (IsThumb1) { // store + update AddrIn BuildMI(*BB, Pos, dl, TII->get(StOpc)) .addReg(Data) .addReg(AddrIn) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) .add(t1CondCodeOp()) .addReg(AddrIn) .addImm(StSize) .add(predOps(ARMCC::AL)); } else if (IsThumb2) { BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data) .addReg(AddrIn) .addImm(StSize) .add(predOps(ARMCC::AL)); } else { // arm BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data) .addReg(AddrIn) .addReg(0) .addImm(StSize) .add(predOps(ARMCC::AL)); } } MachineBasicBlock * ARMTargetLowering::EmitStructByval(MachineInstr &MI, MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); Register dest = MI.getOperand(0).getReg(); Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnitSize = 0; const TargetRegisterClass *TRC = nullptr; const TargetRegisterClass *VecTRC = nullptr; bool IsThumb1 = Subtarget->isThumb1Only(); bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); if (Align & 1) { UnitSize = 1; } else if (Align & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. if (UnitSize == 0) UnitSize = 4; } // Select the correct opcode and register class for unit size load/store bool IsNeon = UnitSize >= 8; TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; if (IsNeon) VecTRC = UnitSize == 16 ? &ARM::DPairRegClass : UnitSize == 8 ? &ARM::DPRRegClass : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { // Use LDR and STR to copy. // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) // [destOut] = STR_POST(scratch, destIn, UnitSize) unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { Register srcOut = MRI.createVirtualRegister(TRC); Register destOut = MRI.createVirtualRegister(TRC); Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } // Handle the leftover bytes with LDRB and STRB. 
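    // For example, an 18-byte copy with NEON available and 8-byte alignment
    // uses UnitSize == 8: LoopSize == 16 is covered by two d-register copies
    // above, and the remaining BytesLeft == 2 bytes are moved one at a time
    // below.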
// [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { Register srcOut = MRI.createVirtualRegister(TRC); Register destOut = MRI.createVirtualRegister(TRC); Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI.eraseFromParent(); // The instruction is gone now. return BB; } // Expand the pseudo op to a loop. // thisMBB: // ... // movw varEnd, # --> with thumb2 // movt varEnd, # // ldrcp varEnd, idx --> without thumb2 // fallthrough --> loopMBB // loopMBB: // PHI varPhi, varEnd, varLoop // PHI srcPhi, src, srcLoop // PHI destPhi, dst, destLoop // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSize) // subs varLoop, varPhi, #UnitSize // bne loopMBB // fallthrough --> exitMBB // exitMBB: // epilogue to handle left-over bytes // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. Register varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) .addImm(LoopSize & 0xFFFF) .add(predOps(ARMCC::AL)); if ((LoopSize & 0xFFFF0000) != 0) BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) .addReg(Vtmp) .addImm(LoopSize >> 16) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. 
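    // This is the fallback for cores without movw/movt (pre-v6T2): the loop
    // count is materialized with a constant-pool load instead of a pair of
    // immediate moves. getPrefTypeAlignment can return 0 when the DataLayout
    // gives no preference, hence the alloc-size fallback below.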
unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .add(predOps(ARMCC::AL)) .addMemOperand(CPMMO); else BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0) .add(predOps(ARMCC::AL)) .addMemOperand(CPMMO); } BB->addSuccessor(loopMBB); // Generate the loop body: // varPhi = PHI(varLoop, varEnd) // srcPhi = PHI(srcLoop, src) // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; Register varLoop = MRI.createVirtualRegister(TRC); Register varPhi = MRI.createVirtualRegister(TRC); Register srcLoop = MRI.createVirtualRegister(TRC); Register srcPhi = MRI.createVirtualRegister(TRC); Register destLoop = MRI.createVirtualRegister(TRC); Register destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) .addReg(varEnd).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) .addReg(srcLoop).addMBB(loopMBB) .addReg(src).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) .addReg(destLoop).addMBB(loopMBB) .addReg(dest).addMBB(entryBB); // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, IsThumb1, IsThumb2); // Decrement loop variable by UnitSize. if (IsThumb1) { BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) .add(t1CondCodeOp()) .addReg(varPhi) .addImm(UnitSize) .add(predOps(ARMCC::AL)); } else { MachineInstrBuilder MIB = BuildMI(*BB, BB->end(), dl, TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); MIB.addReg(varPhi) .addImm(UnitSize) .add(predOps(ARMCC::AL)) .add(condCodeOp()); MIB->getOperand(5).setReg(ARM::CPSR); MIB->getOperand(5).setIsDef(true); } BuildMI(*BB, BB->end(), dl, TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); // loopMBB can loop back to loopMBB or fall through to exitMBB. BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // Add epilogue to handle BytesLeft. BB = exitMBB; auto StartOfExit = exitMBB->begin(); // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { Register srcOut = MRI.createVirtualRegister(TRC); Register destOut = MRI.createVirtualRegister(TRC); Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI.eraseFromParent(); // The instruction is gone now. 
return BB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget->isTargetWindows() && "__chkstk is only supported on Windows"); assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); // __chkstk takes the number of words to allocate on the stack in R4, and // returns the stack adjustment in number of bytes in R4. This will not // clober any other registers (other than the obvious lr). // // Although, technically, IP should be considered a register which may be // clobbered, the call itself will not touch it. Windows on ARM is a pure // thumb-2 environment, so there is no interworking required. As a result, we // do not expect a veneer to be emitted by the linker, clobbering IP. // // Each module receives its own copy of __chkstk, so no import thunk is // required, again, ensuring that IP is not clobbered. // // Finally, although some linkers may theoretically provide a trampoline for // out of range calls (which is quite common due to a 32M range limitation of // branches for Thumb), we can generate the long-call version via // -mcmodel=large, alleviating the need for the trampoline which may clobber // IP. switch (TM.getCodeModel()) { case CodeModel::Tiny: llvm_unreachable("Tiny code model not available on ARM."); case CodeModel::Small: case CodeModel::Medium: case CodeModel::Kernel: BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) .add(predOps(ARMCC::AL)) .addExternalSymbol("__chkstk") .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(ARM::CPSR, RegState::Implicit | RegState::Define | RegState::Dead); break; case CodeModel::Large: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) .add(predOps(ARMCC::AL)) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(ARM::CPSR, RegState::Implicit | RegState::Define | RegState::Dead); break; } } BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) .addReg(ARM::SP, RegState::Kill) .addReg(ARM::R4, RegState::Kill) .setMIFlags(MachineInstr::FrameSetup) .add(predOps(ARMCC::AL)) .add(condCodeOp()); MI.eraseFromParent(); return MBB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); MF->insert(++MBB->getIterator(), ContBB); ContBB->splice(ContBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); ContBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(ContBB); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); MF->push_back(TrapBB); MBB->addSuccessor(TrapBB); BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) .addReg(MI.getOperand(0).getReg()) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(*MBB, MI, DL, 
TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::EQ) .addReg(ARM::CPSR); MI.eraseFromParent(); return ContBB; } // The CPSR operand of SelectItr might be missing a kill marker // because there were multiple uses of CPSR, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of CPSR. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(ARM::CPSR)) return false; if (mi.definesRegister(ARM::CPSR)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether CPSR is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(ARM::CPSR)) return false; } } // We found a def, or hit the end of the basic block and CPSR wasn't live // out. SelectMI should have a kill flag on CPSR. SelectItr->addRegisterKilled(ARM::CPSR, TRI); return true; } MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI.getOpcode()) { default: { MI.print(errs()); llvm_unreachable("Unexpected instr type to insert"); } // Thumb1 post-indexed loads are really just single-register LDMs. case ARM::tLDR_postidx: { MachineOperand Def(MI.getOperand(1)); BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) .add(Def) // Rn_wb .add(MI.getOperand(2)) // Rn .add(MI.getOperand(3)) // PredImm .add(MI.getOperand(4)) // PredReg .add(MI.getOperand(0)) // Rt .cloneMemRefs(MI); MI.eraseFromParent(); return BB; } // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. case ARM::t2STR_preidx: MI.setDesc(TII->get(ARM::t2STR_PRE)); return BB; case ARM::t2STRB_preidx: MI.setDesc(TII->get(ARM::t2STRB_PRE)); return BB; case ARM::t2STRH_preidx: MI.setDesc(TII->get(ARM::t2STRH_PRE)); return BB; case ARM::STRi_preidx: case ARM::STRBi_preidx: { unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; // Decode the offset. 
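    // The AM2 addressing-mode operand packs the add/sub direction and a
    // 12-bit offset magnitude into one immediate; getAM2Op and getAM2Offset
    // below unpack it so the offset can be re-emitted as a plain signed
    // immediate.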
unsigned Offset = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; Offset = ARM_AM::getAM2Offset(Offset); if (isSub) Offset = -Offset; MachineMemOperand *MMO = *MI.memoperands_begin(); BuildMI(*BB, MI, dl, TII->get(NewOpc)) .add(MI.getOperand(0)) // Rn_wb .add(MI.getOperand(1)) // Rt .add(MI.getOperand(2)) // Rn .addImm(Offset) // offset (skip GPR==zero_reg) .add(MI.getOperand(5)) // pred .add(MI.getOperand(6)) .addMemOperand(MMO); MI.eraseFromParent(); return BB; } case ARM::STRr_preidx: case ARM::STRBr_preidx: case ARM::STRH_preidx: { unsigned NewOpc; switch (MI.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); for (unsigned i = 0; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); MI.eraseFromParent(); return BB; } case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Check whether CPSR is live past the tMOVCCr_pseudo. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (!MI.killsRegister(ARM::CPSR) && !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { copy0MBB->addLiveIn(ARM::CPSR); sinkMBB->addLiveIn(ARM::CPSR); } // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); BuildMI(BB, dl, TII->get(ARM::tBcc)) .addMBB(sinkMBB) .addImm(MI.getOperand(3).getImm()) .addReg(MI.getOperand(4).getReg()); // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) .addReg(MI.getOperand(1).getReg()) .addMBB(copy0MBB) .addReg(MI.getOperand(2).getReg()) .addMBB(thisMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::BCCi64: case ARM::BCCZi64: { // If there is an unconditional branch to the other successor, remove it. BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); // Compare both parts that make up the double comparison separately for // equality. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; Register LHS1 = MI.getOperand(1).getReg(); Register LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPri : ARM::CMPri)) .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { Register RHS1 = MI.getOperand(3).getReg(); Register RHS2 = MI.getOperand(4).getReg(); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1) .addReg(RHS1) .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS2).addReg(RHS2) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); if (MI.getOperand(0).getImm() == ARMCC::NE) std::swap(destMBB, exitMBB); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); if (isThumb2) BuildMI(BB, dl, TII->get(ARM::t2B)) .addMBB(exitMBB) .add(predOps(ARMCC::AL)); else BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: return BB; case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; case ARM::ABS: case ARM::t2ABS: { // To insert an ABS instruction, we have to insert the // diamond control-flow pattern. The incoming instruction knows the // source vreg to test against 0, the destination vreg to set, // the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. // It transforms // V1 = ABS V0 // into // V2 = MOVS V0 // BCC (branch to SinkBB if V0 >= 0) // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); Register ABSSrcReg = MI.getOperand(1).getReg(); Register ABSDstReg = MI.getOperand(0).getReg(); bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class Register NewRsbDstReg = MRI.createVirtualRegister( isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); SinkBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RSBBB); BB->addSuccessor(SinkBB); // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); // insert a cmp at the end of BB BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(ABSSrcReg) .addImm(0) .add(predOps(ARMCC::AL)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); // insert rsbri in RSBBB // Note: BCC and rsbri will be converted into predicated rsbmi // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) .addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0) .addImm(0) .add(predOps(ARMCC::AL)) .add(condCodeOp()); // insert PHI in SinkBB, // reuse ABSDstReg to not change uses of ABS instruction BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI.eraseFromParent(); // return last added BB return SinkBB; } case ARM::COPY_STRUCT_BYVAL_I32: ++NumLoopByVals; return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); case ARM::WIN__DBZCHK: return EmitLowered__dbzchk(MI, BB); } } /// Attaches vregs to MEMCPY that it will use as scratch registers /// when it is expanded into LDM/STM. This is done as a post-isel lowering /// instead of as a custom inserter because we need the use list from the SDNode. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node) { bool isThumb1 = Subtarget->isThumb1Only(); DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB(*MF, MI); // If the new dst/src is unused mark it as dead. if (!Node->hasAnyUseOfValue(0)) { MI.getOperand(0).setIsDead(true); } if (!Node->hasAnyUseOfValue(1)) { MI.getOperand(1).setIsDead(true); } // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { if (MI.getOpcode() == ARM::MEMCPY) { attachMEMCPYScratchRegs(Subtarget, MI, Node); return; } const MCInstrDesc *MCID = &MI.getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional // operand is still set to noreg. If needed, set the optional operand's // register to CPSR, and remove the redundant implicit def. // // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); unsigned ccOutIdx; if (NewOpc) { const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() && "converted opcode should be the same except for cc_out" " (and, on Thumb1, pred)"); MI.setDesc(*MCID); // Add the optional cc_out operand MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); // On Thumb1, move all input operands to the end, then add the predicate if (Subtarget->isThumb1Only()) { for (unsigned c = MCID->getNumOperands() - 4; c--;) { MI.addOperand(MI.getOperand(1)); MI.RemoveOperand(1); } // Restore the ties for (unsigned i = MI.getNumOperands(); i--;) { const MachineOperand& op = MI.getOperand(i); if (op.isReg() && op.isUse()) { int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); if (DefIdx != -1) MI.tieOperands(DefIdx, i); } } MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); ccOutIdx = 1; } else ccOutIdx = MCID->getNumOperands() - 1; } else ccOutIdx = MCID->getNumOperands() - 1; // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. 
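  // On Thumb1 the predicate operands come after the input operands rather
  // than right after the optional cc_out, so the rewrite above rotates the
  // inputs to the end, restores the tied-operand constraints, and then
  // appends an AL predicate.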
if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it // since we already have an optional CPSR def. bool definesCPSR = false; bool deadCPSR = false; for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { definesCPSR = true; if (MO.isDead()) deadCPSR = true; MI.RemoveOperand(i); break; } } if (!definesCPSR) { assert(!NewOpc && "Optional cc_out operand required"); return; } assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); if (deadCPSR) { assert(!MI.getOperand(ccOutIdx).getReg() && "expect uninitialized optional cc_out operand"); // Thumb1 instructions must have the S bit even if the CPSR is dead. if (!Subtarget->isThumb1Only()) return; } // If this instruction was defined with an optional CPSR def and its dag node // had a live implicit CPSR def, then activate the optional CPSR def. MachineOperand &MO = MI.getOperand(ccOutIdx); MO.setReg(ARM::CPSR); MO.setIsDef(true); } //===----------------------------------------------------------------------===// // ARM Optimization Hooks //===----------------------------------------------------------------------===// // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. // Detects these expressions where cc is an i1 value: // // (select cc 0, y) [AllOnes=0] // (select cc y, 0) [AllOnes=0] // (zext cc) [AllOnes=0] // (sext cc) [AllOnes=0/1] // (select cc -1, y) [AllOnes=1] // (select cc y, -1) [AllOnes=1] // // Invert is set when N is the null/all ones constant when CC is false. // OtherOp is set to the alternative value of N. static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG) { switch (N->getOpcode()) { default: return false; case ISD::SELECT: { CC = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); if (isZeroOrAllOnes(N1, AllOnes)) { Invert = false; OtherOp = N2; return true; } if (isZeroOrAllOnes(N2, AllOnes)) { Invert = true; OtherOp = N1; return true; } return false; } case ISD::ZERO_EXTEND: // (zext cc) can never be the all ones value. if (AllOnes) return false; LLVM_FALLTHROUGH; case ISD::SIGN_EXTEND: { SDLoc dl(N); EVT VT = N->getValueType(0); CC = N->getOperand(0); if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) return false; Invert = !AllOnes; if (AllOnes) // When looking for an AllOnes constant, N is an sext, and the 'other' // value is 0. OtherOp = DAG.getConstant(0, dl, VT); else if (N->getOpcode() == ISD::ZERO_EXTEND) // When looking for a 0 constant, N can be zext or sext. 
OtherOp = DAG.getConstant(1, dl, VT); else OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); return true; } } } // Combine a constant select operand into its use: // // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) // // The transform is rejected if the select doesn't have a constant operand that // is null, or all ones when AllOnes is set. // // Also recognize sext/zext from i1: // // (add (zext cc), x) -> (select cc (add x, 1), x) // (add (sext cc), x) -> (select cc (add x, -1), x) // // These transformations eventually create predicated instructions. // // @param N The node to transform. // @param Slct The N operand that is a select. // @param OtherOp The other N operand (x above). // @param DCI Context. // @param AllOnes Require the select constant to be all ones instead of null. // @returns The new node, or SDValue() on failure. static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes = false) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDValue NonConstantVal; SDValue CCOp; bool SwapSelectOps; if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, NonConstantVal, DAG)) return SDValue(); // Slct is now know to be the desired identity constant when CC is true. SDValue TrueVal = OtherOp; SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal); // Unless SwapSelectOps says CC should be false. if (SwapSelectOps) std::swap(TrueVal, FalseVal); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal); } // Attempt combineSelectAndUse on each operand of a commutative operator N. static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (N0.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) return Result; if (N1.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) return Result; return SDValue(); } static bool IsVUZPShuffleNode(SDNode *N) { // VUZP shuffle node. if (N->getOpcode() == ARMISD::VUZP) return true; // "VUZP" on i32 is an alias for VTRN. if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) return true; return false; } static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Look for ADD(VUZP.0, VUZP.1). if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || N0 == N1) return SDValue(); // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. if (!N->getValueType(0).is64BitVector()) return SDValue(); // Generate vpadd. 
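  // With d0 = [a0 a1 a2 a3] and d1 = [b0 b1 b2 b3], VUZP gives
  //   vuzp.0 = [a0 a2 b0 b2] and vuzp.1 = [a1 a3 b1 b3],
  // so ADD(vuzp.0, vuzp.1) = [a0+a1, a2+a3, b0+b1, b2+b3], which is exactly
  // VPADD(d0, d1). That equivalence is what the rewrite below relies on.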
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(
      DAG.getConstant(Opcode, dl, TLI.getPointerTy(DAG.getDataLayout())));
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
      N0.getOpcode() != ISD::BUILD_VECTOR ||
      N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand to the ADD which is a BUILD_VECTOR,
  // check to see if each of their operands is an EXTRACT_VECTOR with
  // the same vector and appropriate index.
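  // Concretely, the shape being matched is
  //   N0 = BUILD_VECTOR(extractelt(Vec,0), extractelt(Vec,2), ...)
  //   N1 = BUILD_VECTOR(extractelt(Vec,1), extractelt(Vec,3), ...)
  // so that ADD(N0, N1) sums adjacent lanes of Vec, i.e. a widening pairwise
  // add (vpaddl).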
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex ||
          C1->getZExtValue() != nextIndex + 1)
        return SDValue();

      // Increment index.
      nextIndex += 2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
  case MVT::i8:
    widenType = MVT::getVectorVT(MVT::i16, numElem);
    break;
  case MVT::i16:
    widenType = MVT::getVectorVT(MVT::i32, numElem);
    break;
  case MVT::i32:
    widenType = MVT::getVectorVT(MVT::i64, numElem);
    break;
  default:
    llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND
                                                 : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI || V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb()) {
    if (!Subtarget->hasDSP())
      return SDValue();
  } else if (!Subtarget->hasV5TEOps())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulate the product into a 64-bit value.
  // The 16-bit values will be sign extended somehow or SRA'd into 32-bit
  // values:
  //   (addc (adde (mul 16bit, 16bit), lo), hi)
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                 UMUL_LOHI
  //                / :lo    \ :hi
  //               V          \          [no multiline comment]
  //   loAdd ->  ADDC          |
  //                \ :carry  /
  //                 V       V
  //                   ADDE <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or
  // subtracts a constant with the exact value of 0x80000000, we recognize we
  // are dealing with a "rounded multiply and add" (or subtract) and transform
  // it into either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");
  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
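  // The carry link is operand 2 of the ADDE/SUBE: it is the glue value
  // produced by the matching ADDC/SUBC, and it is what ties the low and high
  // halves of the 64-bit sum together.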
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be a SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC/SUBC we are checking.
  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL, furthermore the LowAddSub must be a
  // constant addition or subtraction with the value of 0x80000000.
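  // SMMLAR/SMMLSR return the most-significant word of a rounded 64-bit
  // product: the rounding step adds 0x80000000 to the product before the top
  // word is taken, which is why an explicit add of exactly that constant to
  // the low half can be folded away here.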
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function cannot handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADD nodes' uses with the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}

static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADD nodes' uses with the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
                                  SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
                                  SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}

static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
  SDNode *AddcNode = N->getOperand(2).getNode();
  SDNode *AddeNode = N->getOperand(3).getNode();
  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
      (AddeNode->getOpcode() == ARMISD::ADDE) &&
      isNullConstant(AddeNode->getOperand(0)) &&
      isNullConstant(AddeNode->getOperand(1)) &&
      (AddeNode->getOperand(2).getNode() == AddcNode))
    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                       DAG.getVTList(MVT::i32, MVT::i32),
                       {N->getOperand(0), N->getOperand(1),
                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
  else
    return SDValue();
}

static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG(DCI.DAG);
  if (N->getOpcode() == ARMISD::SUBC) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS->getOpcode() == ARMISD::ADDE &&
        isNullConstant(LHS->getOperand(0)) &&
        isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
      return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
    }
  }

  if (Subtarget->isThumb1Only()) {
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int32_t imm = C->getSExtValue();
      if (imm < 0 && imm > std::numeric_limits<int>::min()) {
        SDLoc DL(N);
        RHS = DAG.getConstant(-imm, DL, MVT::i32);
        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
                                                           : ARMISD::ADDC;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
      }
    }
  }

  return SDValue();
}

static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb1Only()) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int64_t imm = C->getSExtValue();
      if (imm < 0) {
        SDLoc DL(N);

        // The with-carry-in form matches bitwise not instead of the negation.
        // Effectively, the inverse interpretation of the carry flag already
        // accounts for part of the negation.
        RHS = DAG.getConstant(~imm, DL, MVT::i32);

        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
                                                           : ARMISD::ADDE;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS,
                           N->getOperand(2));
      }
    }
  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  }
  return SDValue();
}

static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue res;
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  if (!TLI.expandABS(N, res, DAG))
    return SDValue();

  return res;
}

/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
  if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
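  // (ARMISD::ADDC/ADDE pairs only appear once legalization has expanded
  // 64-bit arithmetic into 32-bit halves, so there is nothing to match
  // before that point.)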
  if (DCI.isBeforeLegalize())
    return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget) {
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}

bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}

static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
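  // Sketch of the unfold: (add (shl x, 2), 1020) becomes
  // (shl (add x, 255), 2), and the users checked below can then fold the
  // outer shl into their shifted-operand form for free.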
  for (auto U : N->uses()) {
    switch (U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}

/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
      return Result;

  return SDValue();
}

/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
//  However, for (A + B) * (A + B),
//    vadd d2, d0, d1
//    vmul d3, d0, d2
//    vmla d3, d1, d2
//  is slower than
//    vadd d2, d0, d1
//    vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}

static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
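  // (an AND with 0xff or 0xffff is already a single UXTB/UXTH instruction)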
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = countLeadingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    uint32_t C3 = countTrailingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t Trailing = countTrailingZeros(C1);
    uint32_t C3 = countLeadingZeros(C1);
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = countLeadingZeros(C1);
    uint32_t C3 = countTrailingZeros(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate
  //        than c1.
  return SDValue();
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                           HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}

// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
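  // (SMULWB rd, rn, rm computes (rn * sext(rm[15:0])) >> 16; SMULWT uses
  // rm[31:16], which is why a plain SRA #16 feed suffices for it.)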
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // to match as is.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
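      // (The pack halfword forms are PKHBT/PKHTB, gated on the DSP extension
      // checked just below.)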
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

static bool isValidMVECond(unsigned CC, bool IsFloat) {
  switch (CC) {
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::LE:
  case ARMCC::GT:
  case ARMCC::GE:
  case ARMCC::LT:
    return true;
  case ARMCC::HS:
  case ARMCC::HI:
    return !IsFloat;
  default:
    return false;
  };
}

static SDValue PerformORCombine_i1(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates.
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  ARMCC::CondCodes CondCode0 = ARMCC::AL;
  ARMCC::CondCodes CondCode1 = ARMCC::AL;
  if (N0->getOpcode() == ARMISD::VCMP)
    CondCode0 = (ARMCC::CondCodes)cast<ConstantSDNode>(N0->getOperand(2))
                    ->getZExtValue();
  else if (N0->getOpcode() == ARMISD::VCMPZ)
    CondCode0 = (ARMCC::CondCodes)cast<ConstantSDNode>(N0->getOperand(1))
                    ->getZExtValue();
  if (N1->getOpcode() == ARMISD::VCMP)
    CondCode1 = (ARMCC::CondCodes)cast<ConstantSDNode>(N1->getOperand(2))
                    ->getZExtValue();
  else if (N1->getOpcode() == ARMISD::VCMPZ)
    CondCode1 = (ARMCC::CondCodes)cast<ConstantSDNode>(N1->getOperand(1))
                    ->getZExtValue();

  if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
    return SDValue();

  unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
  unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);

  if (!isValidMVECond(Opposite0,
                      N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
      !isValidMVECond(Opposite1,
                      N1->getOperand(0)->getValueType(0).isFloatingPoint()))
    return SDValue();

  SmallVector<SDValue, 4> Ops0;
  Ops0.push_back(N0->getOperand(0));
  if (N0->getOpcode() == ARMISD::VCMP)
    Ops0.push_back(N0->getOperand(1));
  Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
  SmallVector<SDValue, 4> Ops1;
  Ops1.push_back(N1->getOperand(0));
  if (N1->getOpcode() == ARMISD::VCMP)
    Ops1.push_back(N1->getOperand(1));
  Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));

  SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
  SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
  SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
  return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
                         DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                           HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ANDs is a constant splat.
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit widths of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DCI, Subtarget);

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
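  // (BFI copies a contiguous bitfield from one register into another at a
  // given lsb, so an OR of two complementary masked values is its natural
  // source pattern.)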
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  return SDValue();
}

// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn)
// and return it, and fill in FromMask and ToMask with (consecutive) bits
// in "from" to be extracted and their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  SDValue From = N->getOperand(1);
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(),
                                  ToMask.countPopulation());

  // If the Base came from a SHR #C, we can deduce that it is really testing
  // bit #C in the base of the SHR.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B may be zero.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA = A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it
  // can combine with, if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs
  // that aren't compatible, but not if they set the same bit in their
  // destination as we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}

static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
          ISD::SRL, dl, VT, From1,
          DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
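    // (Splitting the f64 load into two i32 loads lets both halves land in
    // GPRs directly instead of round-tripping through a VFP register pair.)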
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
                    LD->getPointerInfo(), LD->getAlignment(),
                    LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 std::min(4U, LD->getAlignment()),
                                 LD->getMemOperand()->getFlags());

    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap(NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}

/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
static bool hasNormalLoadOperand(SDNode *N) {
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
      return true;
  }
  return false;
}

/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the
  // VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., that does not
  // force the use of floating point types.
  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64-bit operands are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}

/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
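  // (An f64 element can be loaded straight into a VFP register, so keeping
  // the insert as a single f64 element move avoids an i32 pair plus a
  // VMOVDRR to reassemble it.)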
  EVT VT = N->getValueType(0);
  SDNode *Elt = N->getOperand(1).getNode();
  if (VT.getVectorElementType() != MVT::i64 ||
      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 VT.getVectorNumElements());
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(V.getNode());
  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                               Vec, V, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}

/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors. That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector. Do that
  // transformation here:
  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
  //   shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  SmallVector<int, 8> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts / 2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask);
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2dup:
      case Intrinsic::arm_neon_vld3dup:
      case Intrinsic::arm_neon_vld4dup:
        // TODO: Support updating VLDxDUP nodes. For now, we just skip
        // combining base updates for such intrinsics.
        continue;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant
      // update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size() - 2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs + 1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op = N->getOperand(0);

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup
  // operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant. Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  EVT VT = N->getValueType(0);
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = N->getOperand(0);

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
                                             Ops, LD->getMemoryVT(),
                                             LD->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store. First,
  // pack all of the elements in one place. Next, store to memory in fewer
  // chunks.
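  // For instance (an illustrative sketch, not tied to any particular input):
  // a truncating store of v4i32 to v4i16 is bitcast to v8i16, shuffled so the
  // four narrowed elements sit contiguously at the bottom of the register,
  // and then written out with a single i64 store.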
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getScalarSizeInBits();
    unsigned ToEltSz = StVT.getScalarSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
      return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store
    // size.
    if (0 != (NumElems * FromEltSz) % ToEltSz)
      return SDValue();

    unsigned SizeRatio = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDLoc DL(St);
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i < NumElems; ++i)
      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                          ? (i + 1) * SizeRatio - 1
                          : i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                  DAG.getUNDEF(WideVec.getValueType()),
                                  ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                        TLI.getPointerTy(DAG.getDataLayout()));
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I, DL));
      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                St->getPointerInfo(), St->getAlignment(),
                                St->getMemOperand()->getFlags());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON
  // and ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ?
                                                        1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ?
                                                  MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
      Op->getOperand(0), DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
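  // Illustrative sketch: a vshifts call whose shift-count operand is a
  // constant splat such as <4 x i32> <3, 3, 3, 3> is rewritten here into the
  // immediate node ARMISD::VSHLIMM, instead of keeping the shift counts live
  // in a register.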
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
    }
  }

  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
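    // A worked instance (illustrative): with AndMask == 0x3ff and
    // ShiftAmt == 2, MaskedBits == 22, so (shl (and x, 0x3ff), 2) becomes
    // (srl (shl x, 22), 20), trading the AND for a shift pair that Thumb1
    // can encode.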
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = countLeadingZeros(AndMask);
      if (MaskedBits > ShiftAmt) {
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
    return SDValue();

  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8-
  // and 16-bit vector elements. NEON supports these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  return SDValue();
}

static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
                                                   SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).
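  //
  // Illustrative sketch: with CN == 0x4 and CM == 0x30, the result is roughly
  //   lsr r0, r0, #2        ; move the tested bit of x down to bit 0
  //   bfi r1, r0, #4, #1    ; copy it into each bit covered by CM
  //   bfi r1, r0, #5, #1
  // in place of the compare-and-conditional-or sequence.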
  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isNullValue())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_set_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero.
  //   If it is zero, the proceeding branch should not branch back to the
  //   beginning of the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isNullValue())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  if (Negate)
    CC = ISD::getSetCCInverse(CC, true);

  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
          && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_set_loop_iterations) {
    SDValue Res;

    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = { Chain, Elements, Dest };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = { Chain, Elements, OtherTarget };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
    return Res;
  } else {
    SDValue Size = DAG.getTargetConstant(
      cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  return SDValue();
}

/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }

  return SDValue();
}

/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
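  // In DAG terms, the first simplification below rewrites
  //   (cmov y, t, ne, (cmpz x, y))  -->  (cmov x, t, ne, (cmpz x, y))
  // which is valid because in the "equal" case x and y hold the same value,
  // so the copy of x into a scratch register becomes unnecessary.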
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  if (!VT.isInteger())
    return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT,
                          DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                    DAG.getConstant(1, dl, MVT::i32),
                                    Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub = DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32),
                                LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below.
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub = DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32),
                                LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (SUBCARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc,
                      Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}

SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ISD::BRCOND:
  case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return PerformShiftCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD: return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ARMISD::SMULWB: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}

bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT ARMTargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::f16:
    return Subtarget->hasFullFP16();
  }

  return false;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
    return false;

  switch (I->getOpcode()) {
  case Instruction::Sub:
  case Instruction::Add: {
    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
      return false;
    Ops.push_back(&I->getOperandUse(0));
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }
  default:
    return false;
  }
  return false;
}

bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what. There can be two uses by the same instruction.
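  // (For instance, a single add that uses the extended value as both of its
  // operands is one such case.)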
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    // Scale == 4;
    Scale = 4;
    break;
  }

  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}

static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    case MVT::i8:
      return isUInt<7>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = - V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(V);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(V);
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
      return false;
    return isShiftedUInt<8, 2>(V);
  }
}

bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}

bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;
    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
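  // For example, cmp r0, #-1 is not encodable, but the equivalent cmn r0, #1
  // is; checking both Imm and -Imm below models exactly that.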
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}

/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which
/// can add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = std::abs(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}

static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}

static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad,
                                     SDValue &Base, SDValue &Offset,
                                     bool &isInc, SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
  }

  return false;
}

static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
                                      bool isSEXTLoad, bool isLE, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;
  if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
    return false;

  ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  int RHSC = (int)RHS->getZExtValue();

  auto IsInRange = [&](int RHSC, int Limit, int Scale) {
    if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
    return false;
  };

  // Try to find a matching instruction based on s/zext, Alignment, Offset and
  // (in BE) type.
  Base = Ptr->getOperand(0);
  if (VT == MVT::v4i16) {
    if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
      return true;
  } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
    if (IsInRange(RHSC, 0x80, 1))
      return true;
  } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
             IsInRange(RHSC, 0x80, 4))
    return true;
  else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
           IsInRange(RHSC, 0x80, 2))
    return true;
  else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
    return true;
  return false;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  if (Subtarget->isThumb1Only())
    return false;

  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
  } else
    return false;

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                         Offset, isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                          Offset, isInc, DAG);
  }
  if (!isLegal)
    return false;

  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}
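// E.g. the source pattern "p += 4; x = *p" can become a single pre-indexed
// load once the parts above are legal:
//   ldr r0, [r1, #4]!   ; loads from r1+4 and writes r1+4 back into r1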
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false, isNonExt;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                         isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
  }
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      EVT VT = cast<MemSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
    (void)SrcSz;
    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz, true /* extended bits are known zero */);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  }
}

bool ARMTargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedAPInt, TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();
  unsigned Demanded = DemandedAPInt.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.
  return false;
}


//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  unsigned S = Constraint.size();
  if (S == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Immediate; // Constant for movw.
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (S == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'T': return C_RegisterClass;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
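// For example, the register classes classified above are requested from C as:
//   asm("rev %0, %1" : "=l"(Out) : "l"(In));                // low GPRs r0-r7
//   asm("vadd.f32 %0, %1, %2" : "=w"(D) : "w"(A), "w"(B));  // VFP registers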
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}

using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  switch (Constraint.size()) {
  case 1:
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      if (Subtarget->isThumb1Only())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32 || VT == MVT::i32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_VFP2RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_VFP2RegClass);
      break;
    }
    break;

  case 2:
    if (Constraint[0] == 'T') {
      switch (Constraint[1]) {
      default:
        break;
      case 'e':
        return RCPair(0U, &ARM::tGPREvenRegClass);
      case 'o':
        return RCPair(0U, &ARM::tGPROddRegClass);
      }
    }
    break;

  default:
    break;
  }

  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits.  Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops())
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32.  This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }

    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

static RTLIB::Libcall getDivRemLibcall(
    const SDNode *N, MVT::SimpleValueType SVT) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  RTLIB::Libcall LC;
  switch (SVT) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}

static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}
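// For reference, the AEABI divmod helpers selected above return both results
// in registers, e.g. __aeabi_idivmod returns the quotient in r0 and the
// remainder in r1, which is why LowerDivRem marks the call setInRegister().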
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //     div = a / b
  //     rem = a - b * div
  //     return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}

// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                                             SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}
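// E.g. on a target with hardware divide, the DIVREM path above selects to
// roughly:
//   sdiv r2, r0, r1      ; div = a / b
//   mls  r3, r2, r1, r0  ; rem = a - div * b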
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}
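// Sketch of the resulting sequence, assuming the usual Windows on ARM
// __chkstk contract (allocation size passed in r4 as a count of 4-byte
// words):
//   lsrs r4, rN, #2   ; Words = Size >> 2
//   bl   __chkstk     ; probe the newly allocated pages
//   mov  rD, sp       ; NewSP read back from the stack pointer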
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  SDValue SrcVal = Op.getOperand(0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  // Either we are converting from 16 -> 64, without FP16 and/or
  // FP.double-precision or without Armv8-fp. So we must do it in two
  // steps.
  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
  // without FP16. So we must do a function call.
  SDLoc Loc(Op);
  RTLIB::Libcall LC;
  if (SrcSz == 16) {
    // Instruction from 16 -> 32
    if (Subtarget->hasFP16())
      SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
    // Lib call from 16 -> 32
    else {
      LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      SrcVal =
        makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
    }
  }

  if (DstSz != 64)
    return SrcVal;
  // For sure now SrcVal is 32 bits
  if (Subtarget->hasFP64()) // Instruction from 32 -> 64
    return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);

  LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_EXTEND");
  return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
}

SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVT.getSizeInBits();
  (void)DstSz;
  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // Instruction from 32 -> 16 if hasFP16 is valid
  if (SrcSz == 32 && Subtarget->hasFP16())
    return Op;

  // Lib call from 32 -> 16 / 64 -> [32, 16]
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
}

void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);
  SDValue Hi, Lo, Tmp;

  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
    return;

  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(0, dl, HalfT));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(1, dl, HalfT));

  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
                    DAG.getConstant(OpTypeBits - 1, dl,
                                    getShiftAmountTy(HalfT,
                                                     DAG.getDataLayout())));
  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
                   SDValue(Lo.getNode(), 1));
  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
  Results.push_back(Lo);
  Results.push_back(Hi);
}

bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // there can be 1's on either or both "outsides", all the "inside"
  // bits must be 0's
  return isShiftedMask_32(~v);
}
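// For example, ~0xff0000ff == 0x00ffff00 is a shifted run of ones, so
// 0xff0000ff is an inverted mask ("and r0, r0, #0xff0000ff" can become a
// bfc of bits 8-23), while 0xff00ff00 is rejected because its zero bits are
// not contiguous.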
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
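// E.g. extracting either <2 x i32> half of a NEON <4 x i32> (Index 0 or 2)
// is free: a Q register aliases its two D halves, so the subvector is simply
// the low or high D register; other indices would need a vext.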
llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); return Builder.CreateCall(DMB, CDomain); } } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/non-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do LLVM_FALLTHROUGH; case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: if (Subtarget->preferISHSTBarriers()) return makeDMB(Builder, ARM_MB::ISHST); // FIXME: add a comment with a link to documentation justifying this. else return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/not-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Release: return nullptr; // Nothing to do case AtomicOrdering::Acquire: case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitTrailingFence"); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return (Size == 64) && !Subtarget->isMClass(); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (AI->isFloatingPointOperation()) return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) ? 
// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode
  // for those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in
  // a D or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}
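// E.g. the i64 path in emitLoadLinked/emitStoreConditional above produces IR
// along the lines of:
//   %lohi = call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
//   %lo   = extractvalue { i32, i32 } %lohi, 0
//   %hi   = extractvalue { i32, i32 } %lohi, 1
// with the halves zero-extended, shifted and OR'd back into an i64.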
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

bool ARMTargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (VecTy->getElementType()->isHalfTy())
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger
  // than 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                            VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Type *Tys[] = {VecTy, Int8Ptr};
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    SmallVector<Value *, 2> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));

    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is
      // pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
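// E.g. when NumLoads > 1 above, a factor-2 load whose shuffles yield
// <8 x i32> results is split into two 128-bit accesses, roughly:
//   vld2.32 {d16, d17, d18, d19}, [r0]!
//   vld2.32 {d20, d21, d22, d23}, [r0]
// and each shufflevector is rebuilt by concatenating its two sub-vectors.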
/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Type *Tys[] = {Int8Ptr, SubVecTy};
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 6> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

    Function *VstNFunc =
        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    Ops.push_back(Builder.getInt32(SI->getAlignment()));
    Builder.CreateCall(VstNFunc, Ops);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
unsigned ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                          DataLayout DL) const {
  if (!ArgTy->isVectorTy())
    return DL.getABITypeAlignment(ArgTy);

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}
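// Illustration only, not part of the patch: how the AAPCS-VFP rules above
// classify a few shapes (hypothetical types). A homogeneous aggregate has at
// most four members of one uniform base type:
//
//   struct { float x, y, z; }      -> Base = HA_FLOAT,  Members = 3: HA
//   double[4]                      -> Base = HA_DOUBLE, Members = 4: HA
//   struct { float f; double d; }  -> conflicting base types: not an HA
//   float[5]                       -> Members = 5 exceeds 4:  not an HA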
unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index d84a235b8b2a..15dcd9d8db27 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -1,856 +1,854 @@
//===- ARMISelLowering.h - ARM DAG Lowering Interface -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H

#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/MachineValueType.h"
#include <utility>

namespace llvm {

class ARMSubtarget;
class DataLayout;
class FastISel;
class FunctionLoweringInfo;
class GlobalValue;
class InstrItineraryData;
class Instruction;
class MachineBasicBlock;
class MachineInstr;
class SelectionDAG;
class TargetLibraryInfo;
class TargetMachine;
class TargetRegisterInfo;
class VectorType;

  namespace ARMISD {

    // ARM Specific DAG Nodes
    enum NodeType : unsigned {
      // Start the numbering where the builtin ops and target ops leave off.
      FIRST_NUMBER = ISD::BUILTIN_OP_END,

      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
                    // TargetExternalSymbol, and TargetGlobalAddress.
      WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
                    // PIC mode.
      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable

      // Add pseudo op to model memcpy for struct byval.
      COPY_STRUCT_BYVAL,

      CALL,         // Function call.
      CALL_PRED,    // Function call that's predicable.
      CALL_NOLINK,  // Function call with branch not branch-and-link.
      BRCOND,       // Conditional branch.
      BR_JT,        // Jumptable branch.
      BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
      RET_FLAG,     // Return with a flag operand.
      INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.

      PIC_ADD,      // Add with a PC operand and a PIC label.

      ASRL,         // MVE long arithmetic shift right.
      LSRL,         // MVE long shift right.
      LSLL,         // MVE long shift left.

      CMP,          // ARM compare instructions.
      CMN,          // ARM CMN instructions.
      CMPZ,         // ARM compare that sets only Z flag.
      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
      FMSTAT,       // ARM fmstat instruction.

      CMOV,         // ARM conditional move instructions.
      SUBS,         // Flag-setting subtraction.

      SSAT,         // Signed saturation
      USAT,         // Unsigned saturation

      BCC_i64,

      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.

      ADDC,         // Add with carry
      ADDE,         // Add using carry
      SUBC,         // Sub with carry
      SUBE,         // Sub using carry
      LSLS,         // Shift left producing carry

      VMOVRRD,      // double to two gprs.
      VMOVDRR,      // Two gprs to double.
      VMOVSR,       // move gpr to single, used for f32 literal constructed in a gpr

      EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
      EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
      EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.

      TC_RETURN,    // Tail call return pseudo.

      THREAD_POINTER,

      DYN_ALLOC,    // Dynamic allocation on the stack.

      MEMBARRIER_MCR, // Memory barrier (MCR)

      PRELOAD,      // Preload

      WIN__CHKSTK,  // Windows' __chkstk call to do stack probing.
      WIN__DBZCHK,  // Windows' divide by zero check

      WLS,          // Low-overhead loops, While Loop Start
      LOOP_DEC,     // Really a part of LE, performs the sub
      LE,           // Low-overhead loops, Loop End

      PREDICATE_CAST, // Predicate cast for MVE i1 types

      VCMP,         // Vector compare.
      VCMPZ,        // Vector compare to zero.
      VTST,         // Vector test bits.

      // Vector shift by vector
      VSHLs,        // ...left/right by signed
      VSHLu,        // ...left/right by unsigned

      // Vector shift by immediate:
      VSHLIMM,      // ...left
      VSHRsIMM,     // ...right (signed)
      VSHRuIMM,     // ...right (unsigned)

      // Vector rounding shift by immediate:
      VRSHRsIMM,    // ...right (signed)
      VRSHRuIMM,    // ...right (unsigned)
      VRSHRNIMM,    // ...right narrow

      // Vector saturating shift by immediate:
      VQSHLsIMM,    // ...left (signed)
      VQSHLuIMM,    // ...left (unsigned)
      VQSHLsuIMM,   // ...left (signed to unsigned)
      VQSHRNsIMM,   // ...right narrow (signed)
      VQSHRNuIMM,   // ...right narrow (unsigned)
      VQSHRNsuIMM,  // ...right narrow (signed to unsigned)

      // Vector saturating rounding shift by immediate:
      VQRSHRNsIMM,  // ...right narrow (signed)
      VQRSHRNuIMM,  // ...right narrow (unsigned)
      VQRSHRNsuIMM, // ...right narrow (signed to unsigned)

      // Vector shift and insert:
      VSLIIMM,      // ...left
      VSRIIMM,      // ...right

      // Vector get lane (VMOV scalar to ARM core register)
      // (These are used for 8- and 16-bit element types only.)
      VGETLANEu,    // zero-extend vector extract element
      VGETLANEs,    // sign-extend vector extract element

      // Vector move immediate and move negated immediate:
      VMOVIMM,
      VMVNIMM,

      // Vector move f32 immediate:
      VMOVFPIMM,

      // Move H <-> R, clearing top 16 bits
      VMOVrh,
      VMOVhr,

      // Vector duplicate:
      VDUP,
      VDUPLANE,

      // Vector shuffles:
      VEXT,         // extract
      VREV64,       // reverse elements within 64-bit doublewords
      VREV32,       // reverse elements within 32-bit words
      VREV16,       // reverse elements within 16-bit halfwords
      VZIP,         // zip (interleave)
      VUZP,         // unzip (deinterleave)
      VTRN,         // transpose
      VTBL1,        // 1-register shuffle with mask
      VTBL2,        // 2-register shuffle with mask

      // Vector multiply long:
      VMULLs,       // ...signed
      VMULLu,       // ...unsigned

      SMULWB,       // Signed multiply word by half word, bottom
      SMULWT,       // Signed multiply word by half word, top
      UMLAL,        // 64bit Unsigned Accumulate Multiply
      SMLAL,        // 64bit Signed Accumulate Multiply
      UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
      SMLALBB,      // 64-bit signed accumulate multiply bottom, bottom 16
      SMLALBT,      // 64-bit signed accumulate multiply bottom, top 16
      SMLALTB,      // 64-bit signed accumulate multiply top, bottom 16
      SMLALTT,      // 64-bit signed accumulate multiply top, top 16
      SMLALD,       // Signed multiply accumulate long dual
      SMLALDX,      // Signed multiply accumulate long dual exchange
      SMLSLD,       // Signed multiply subtract long dual
      SMLSLDX,      // Signed multiply subtract long dual exchange
      SMMLAR,       // Signed multiply long, round and add
      SMMLSR,       // Signed multiply long, subtract and round

      // Operands of the standard BUILD_VECTOR node are not legalized, which
      // is fine if BUILD_VECTORs are always lowered to shuffles or other
      // operations, but for ARM some BUILD_VECTORs are legal as-is and their
      // operands need to be legalized. Define an ARM-specific version of
      // BUILD_VECTOR for this purpose.
      BUILD_VECTOR,

      // Bit-field insert
      BFI,

      // Vector OR with immediate
      VORRIMM,
      // Vector AND with NOT of immediate
      VBICIMM,

      // Vector bitwise select
      VBSL,

      // Pseudo-instruction representing a memory copy using ldm/stm
      // instructions.
      MEMCPY,

      // Vector load N-element structure to all lanes:
      VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
      VLD2DUP,
      VLD3DUP,
      VLD4DUP,

      // NEON loads with post-increment base updates:
      VLD1_UPD,
      VLD2_UPD,
      VLD3_UPD,
      VLD4_UPD,
      VLD2LN_UPD,
      VLD3LN_UPD,
      VLD4LN_UPD,
      VLD1DUP_UPD,
      VLD2DUP_UPD,
      VLD3DUP_UPD,
      VLD4DUP_UPD,

      // NEON stores with post-increment base updates:
      VST1_UPD,
      VST2_UPD,
      VST3_UPD,
      VST4_UPD,
      VST2LN_UPD,
      VST3LN_UPD,
      VST4LN_UPD
    };

  } // end namespace ARMISD

  /// Define some predicates that are used for node matching.
  namespace ARM {

    bool isBitFieldInvertedMask(unsigned v);

  } // end namespace ARM
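  // Illustration only, not part of the patch: isBitFieldInvertedMask returns
  // true for a value whose complement is one contiguous run of ones, e.g.
  // 0xFFFF00FF (~v == 0x0000FF00), the AND-mask shape that can be folded
  // into a BFI/BFC bit-field operation.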
  //===--------------------------------------------------------------------===//
  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface

  class ARMTargetLowering : public TargetLowering {
  public:
    explicit ARMTargetLowering(const TargetMachine &TM,
                               const ARMSubtarget &STI);

    unsigned getJumpTableEncoding() const override;
    bool useSoftFloat() const override;

    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

    /// ReplaceNodeResults - Replace the results of node with an illegal result
    /// type with new values built out of custom code.
    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                            SelectionDAG &DAG) const override;

    const char *getTargetNodeName(unsigned Opcode) const override;

    bool isSelectSupported(SelectSupportKind Kind) const override {
      // ARM does not support scalar condition selects on vectors.
      return (Kind != ScalarCondVectorVal);
    }

    bool isReadOnly(const GlobalValue *GV) const;

    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                           EVT VT) const override;

    MachineBasicBlock *
    EmitInstrWithCustomInserter(MachineInstr &MI,
                                MachineBasicBlock *MBB) const override;

    void AdjustInstrPostInstrSelection(MachineInstr &MI,
                                       SDNode *Node) const override;

    SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
    SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
    SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

    bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;

    /// allowsMisalignedMemoryAccesses - Returns true if the target allows
    /// unaligned memory accesses of the specified type. Returns whether it
    /// is "fast" by reference in the second argument.
    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
                                        unsigned Align,
                                        MachineMemOperand::Flags Flags,
                                        bool *Fast) const override;

    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                            const AttributeList &FuncAttributes) const override;

    bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
    bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
    bool isZExtFree(SDValue Val, EVT VT2) const override;
    bool shouldSinkOperands(Instruction *I,
                            SmallVectorImpl<Use *> &Ops) const override;

    bool isFNegFree(EVT VT) const override;

    bool isVectorLoadExtDesirable(SDValue ExtVal) const override;

    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;

    /// isLegalAddressingMode - Return true if the addressing mode represented
    /// by AM is legal for this target, for a load/store of the specified type.
    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                               Type *Ty, unsigned AS,
                               Instruction *I = nullptr) const override;

    /// getScalingFactorCost - Return the cost of the scaling used in
    /// addressing mode represented by AM.
    /// If the AM is supported, the return value must be >= 0.
    /// If the AM is not supported, the return value must be negative.
    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                             unsigned AS) const override;

    bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;

    /// Returns true if the addresing mode representing by AM is legal
    /// for the Thumb1 target, for a load/store of the specified type.
    bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const;

    /// isLegalICmpImmediate - Return true if the specified immediate is legal
    /// icmp immediate, that is the target has icmp instructions which can
    /// compare a register against the immediate without having to materialize
    /// the immediate into a register.
    bool isLegalICmpImmediate(int64_t Imm) const override;

    /// isLegalAddImmediate - Return true if the specified immediate is legal
    /// add immediate, that is the target has add instructions which can
    /// add a register and the immediate without having to materialize
    /// the immediate into a register.
    bool isLegalAddImmediate(int64_t Imm) const override;

    /// getPreIndexedAddressParts - returns true by value, base pointer and
    /// offset pointer and addressing mode by reference if the node's address
    /// can be legally represented as pre-indexed load / store address.
    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
                                   ISD::MemIndexedMode &AM,
                                   SelectionDAG &DAG) const override;

    /// getPostIndexedAddressParts - returns true by value, base pointer and
    /// offset pointer and addressing mode by reference if this node can be
    /// combined with a load / store to form a post-indexed load / store.
    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
                                    SDValue &Offset, ISD::MemIndexedMode &AM,
                                    SelectionDAG &DAG) const override;

    void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
                                       const APInt &DemandedElts,
                                       const SelectionDAG &DAG,
                                       unsigned Depth) const override;

    bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                      TargetLoweringOpt &TLO) const override;

    bool ExpandInlineAsm(CallInst *CI) const override;

    ConstraintType getConstraintType(StringRef Constraint) const override;

    /// Examine constraint string and operand type and determine a weight value.
    /// The operand object must already have been set up with the operand type.
    ConstraintWeight getSingleConstraintMatchWeight(
      AsmOperandInfo &info, const char *constraint) const override;

    std::pair<unsigned, const TargetRegisterClass *>
    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                 StringRef Constraint, MVT VT) const override;

    const char *LowerXConstraint(EVT ConstraintVT) const override;

    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
    /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
    /// true it means one of the asm constraint of the inline asm instruction
    /// being processed is 'm'.
    void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                      std::vector<SDValue> &Ops,
                                      SelectionDAG &DAG) const override;

    unsigned
    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
      if (ConstraintCode == "Q")
        return InlineAsm::Constraint_Q;
      else if (ConstraintCode == "o")
        return InlineAsm::Constraint_o;
      else if (ConstraintCode.size() == 2) {
        if (ConstraintCode[0] == 'U') {
          switch(ConstraintCode[1]) {
          default:
            break;
          case 'm':
            return InlineAsm::Constraint_Um;
          case 'n':
            return InlineAsm::Constraint_Un;
          case 'q':
            return InlineAsm::Constraint_Uq;
          case 's':
            return InlineAsm::Constraint_Us;
          case 't':
            return InlineAsm::Constraint_Ut;
          case 'v':
            return InlineAsm::Constraint_Uv;
          case 'y':
            return InlineAsm::Constraint_Uy;
          }
        }
      }
      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
    }
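    // Illustration only, not part of the patch: these constraints come from
    // user inline asm such as (hypothetical example)
    //
    //   asm volatile("ldr %0, %1" : "=r"(v) : "Q"(*p));
    //
    // where "Q" requires the operand's address to be a single base register
    // with no offset, mapped above to InlineAsm::Constraint_Q.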
    const ARMSubtarget* getSubtarget() const {
      return Subtarget;
    }

    /// getRegClassFor - Return the register class that should be used for the
    /// specified value type.
    const TargetRegisterClass *
    getRegClassFor(MVT VT, bool isDivergent = false) const override;

    /// Returns true if a cast between SrcAS and DestAS is a noop.
    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
      // Addrspacecasts are always noops.
      return true;
    }

    bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                unsigned &PrefAlign) const override;

    /// createFastISel - This method returns a target specific FastISel object,
    /// or null if the target does not support "fast" ISel.
    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                             const TargetLibraryInfo *libInfo) const override;

    Sched::Preference getSchedulingPreference(SDNode *N) const override;

    bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;

    /// isFPImmLegal - Returns true if the target can instruction select the
    /// specified FP immediate natively. If false, the legalizer will
    /// materialize the FP immediate as a load from a constant pool.
    bool isFPImmLegal(const APFloat &Imm, EVT VT,
                      bool ForCodeSize = false) const override;

    bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                            MachineFunction &MF,
                            unsigned Intrinsic) const override;

    /// Returns true if it is beneficial to convert a load of a constant
    /// to just the constant itself.
    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                           Type *Ty) const override;

    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
    /// with this index.
    bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                 unsigned Index) const override;

    /// Returns true if an argument of type Ty needs to be passed in a
    /// contiguous block of registers in calling convention CallConv.
    bool functionArgumentNeedsConsecutiveRegisters(
        Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;

    /// If a physical register, this returns the register that receives the
    /// exception address on entry to an EH pad.
    unsigned
    getExceptionPointerRegister(const Constant *PersonalityFn) const override;

    /// If a physical register, this returns the register that receives the
    /// exception typeid on entry to a landing pad.
    unsigned
    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;

    Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                          AtomicOrdering Ord) const override;
    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr,
                                AtomicOrdering Ord) const override;

    void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;

    Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
                                  AtomicOrdering Ord) const override;
    Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
                                   AtomicOrdering Ord) const override;

    unsigned getMaxSupportedInterleaveFactor() const override;

    bool lowerInterleavedLoad(LoadInst *LI,
                              ArrayRef<ShuffleVectorInst *> Shuffles,
                              ArrayRef<unsigned> Indices,
                              unsigned Factor) const override;
    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                               unsigned Factor) const override;

    bool shouldInsertFencesForAtomic(const Instruction *I) const override;
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;

    bool useLoadStackGuardNode() const override;

    void insertSSPDeclarations(Module &M) const override;
    Value *getSDagStackGuard(const Module &M) const override;
    Function *getSSPStackGuardCheck(const Module &M) const override;

    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                   unsigned &Cost) const override;

    bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                          const SelectionDAG &DAG) const override {
      // Do not merge to larger than i32.
      return (MemVT.getSizeInBits() <= 32);
    }
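    // Illustration only, not part of the patch: under the i32 cap above, four
    // adjacent i8 stores may still be merged into one i32 store (a single
    // STR), but never widened further into one i64 store.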
    bool isCheapToSpeculateCttz() const override;
    bool isCheapToSpeculateCtlz() const override;

    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
      return VT.isScalarInteger();
    }

    bool supportSwiftError() const override {
      return true;
    }

    bool hasStandaloneRem(EVT VT) const override {
      return HasStandaloneRem;
    }

    bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;

    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
    CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;

    /// Returns true if \p VecTy is a legal interleaved access type. This
    /// function checks the vector element type and the overall width of the
    /// vector.
    bool isLegalInterleavedAccessType(VectorType *VecTy,
                                      const DataLayout &DL) const;

    bool alignLoopsWithOptSize() const override;

    /// Returns the number of interleaved accesses that will be generated when
    /// lowering accesses of the given type.
    unsigned getNumInterleavedAccesses(VectorType *VecTy,
                                       const DataLayout &DL) const;

    void finalizeLowering(MachineFunction &MF) const override;

    /// Return the correct alignment for the current calling convention.
    unsigned getABIAlignmentForCallingConv(Type *ArgTy,
                                           DataLayout DL) const override;

    bool isDesirableToCommuteWithShift(const SDNode *N,
                                       CombineLevel Level) const override;

    bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                           CombineLevel Level) const override;

    bool preferIncOfAddToSubOfNot(EVT VT) const override;

  protected:
    std::pair<const TargetRegisterClass *, uint8_t>
    findRepresentativeClass(const TargetRegisterInfo *TRI,
                            MVT VT) const override;

  private:
    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
    /// make the right decision when generating code for different targets.
    const ARMSubtarget *Subtarget;

    const TargetRegisterInfo *RegInfo;

    const InstrItineraryData *Itins;

    /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
    unsigned ARMPCLabelIndex;

    // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
    // check.
    bool InsertFencesForAtomic;

    bool HasStandaloneRem = true;

    void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
    void addDRTypeForNEON(MVT VT);
    void addQRTypeForNEON(MVT VT);

    std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                              SDValue &ARMcc) const;

    using RegsToPassVector = SmallVector<std::pair<unsigned, SDValue>, 8>;

    void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
                          SDValue &Arg, RegsToPassVector &RegsToPass,
                          CCValAssign &VA, CCValAssign &NextVA,
                          SDValue &StackPtr,
                          SmallVectorImpl<SDValue> &MemOpChains,
                          ISD::ArgFlagsTy Flags) const;
    SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                 SDValue &Root, SelectionDAG &DAG,
                                 const SDLoc &dl) const;

    CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
                                            bool isVarArg) const;
    CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
                                  bool isVarArg) const;
    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
                             const SDLoc &dl, SelectionDAG &DAG,
                             const CCValAssign &VA,
                             ISD::ArgFlagsTy Flags) const;
    SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG,
-                                const ARMSubtarget *Subtarget) const;
    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) const;
    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                          SelectionDAG &DAG) const;
    SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 TLSModel::Model model) const;
    SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                            const ARMSubtarget *ST) const;
    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *ST) const;
    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
    void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
                           SmallVectorImpl<SDValue> &Results) const;
    SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
                                   SDValue &Chain) const;
    SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
    void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                  SelectionDAG &DAG) const;

    unsigned getRegisterByName(const char* RegName, EVT VT,
                               SelectionDAG &DAG) const override;

    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                          SmallVectorImpl<SDNode *> &Created) const override;

    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
    /// expanded to FMAs when this method returns true, otherwise fmuladd is
    /// expanded to fmul + fadd.
    ///
    /// ARM supports both fused and unfused multiply-add operations; we already
    /// lower a pair of fmul and fadd to the latter so it's not clear that there
    /// would be a gain or that the gain would be worthwhile enough to risk
    /// correctness bugs.
    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }

    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;

    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            const SDLoc &dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals,
                            bool isThisReturn, SDValue ThisVal) const;

    bool supportSplitCSR(MachineFunction *MF) const override {
      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
          MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
    }

    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
    void insertCopiesSplitCSR(
      MachineBasicBlock *Entry,
      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

    SDValue
    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::InputArg> &Ins,
                         const SDLoc &dl, SelectionDAG &DAG,
                         SmallVectorImpl<SDValue> &InVals) const override;

    int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
                       SDValue &Chain, const Value *OrigArg,
                       unsigned InRegsParamRecordIdx, int ArgOffset,
                       unsigned ArgSize) const;

    void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                              const SDLoc &dl, SDValue &Chain,
                              unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
                              bool ForceMutable = false) const;

    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
                      SmallVectorImpl<SDValue> &InVals) const override;

    /// HandleByVal - Target-specific cleanup for ByVal support.
    void HandleByVal(CCState *, unsigned &, unsigned) const override;

    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
    /// for tail call optimization. Targets which want to do tail call
    /// optimization should implement this function.
    bool IsEligibleForTailCallOptimization(
        SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
        bool isCalleeStructRet, bool isCallerStructRet,
        const SmallVectorImpl<ISD::OutputArg> &Outs,
        const SmallVectorImpl<SDValue> &OutVals,
        const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
        const bool isIndirect) const;

    bool CanLowerReturn(CallingConv::ID CallConv,
                        MachineFunction &MF, bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        LLVMContext &Context) const override;

    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SDLoc &dl, SelectionDAG &DAG) const override;

    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

    bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

    bool shouldConsiderGEPOffsetSplit() const override { return true; }

    bool isUnsupportedFloatingType(EVT VT) const;

    SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
                    SDValue ARMcc, SDValue CCR, SDValue Cmp,
                    SelectionDAG &DAG) const;
    SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                      SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
    SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                      const SDLoc &dl, bool InvalidOnQNaN) const;
    SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;

    SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;

    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                MachineBasicBlock *DispatchBB, int FI) const;

    void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;

    bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitStructByval(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const;

    MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
                                           MachineBasicBlock *MBB) const;
    MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                           MachineBasicBlock *MBB) const;

    void addMVEVectorTypes(bool HasMVEFP);
    void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action);
    void setAllExpand(MVT VT);
  };

  enum VMOVModImmType {
    VMOVModImm,
    VMVNModImm,
    MVEVMVNModImm,
    OtherModImm
  };

  namespace ARM {

    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                             const TargetLibraryInfo *libInfo);

  } // end namespace ARM

} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index ac1be46447f3..7783bedc2101 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -1,6192 +1,6186 @@
//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the ARM instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// ARM specific DAG Nodes.
//

// Type profiles.
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
                                       [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;

def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;

def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;

def SDT_ARMCMov    : SDTypeProfile<1, 3,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                                    SDTCisVT<3, i32>]>;

def SDT_ARMBrcond  : SDTypeProfile<0, 2,
                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;

def SDT_ARMBrJT    : SDTypeProfile<0, 2,
                                   [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;

def SDT_ARMBr2JT   : SDTypeProfile<0, 3,
                                   [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
                                    SDTCisVT<2, i32>]>;

def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
                                   [SDTCisVT<0, i32>,
                                    SDTCisVT<1, i32>, SDTCisVT<2, i32>,
                                    SDTCisVT<3, i32>, SDTCisVT<4, i32>,
                                    SDTCisVT<5, OtherVT>]>;

def SDT_ARMAnd     : SDTypeProfile<1, 2,
                                   [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                    SDTCisVT<2, i32>]>;

def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;

def SDT_ARMFCmp    : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
                                          SDTCisVT<2, i32>]>;

def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;

def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
                                                 SDTCisInt<2>]>;
def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>;

def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
                                           SDTCisInt<1>]>;

def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;

def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                      SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;

def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;

def SDT_ARMMEMCPY  : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, i32>, SDTCisVT<3, i32>,
                                          SDTCisVT<4, i32>]>;

def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                            [SDTCisSameAs<0, 2>,
                                             SDTCisSameAs<0, 3>,
                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;

// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
                                                 [SDTCisSameAs<0, 2>,
                                                  SDTCisSameAs<0, 3>,
                                                  SDTCisInt<0>,
                                                  SDTCisVT<1, i32>,
                                                  SDTCisVT<4, i32>]>;

def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
                                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
                                       SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>]>;

// ARMlsll, ARMlsrl, ARMasrl
def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
                                                SDTCisSameAs<0, 2>,
                                                SDTCisSameAs<0, 3>,
                                                SDTCisInt<0>, SDTCisInt<4>]>;

// TODO Add another operand for 'Size' so that we can re-use this node when we
// start supporting *TP versions.
def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
                                         SDTCisVT<1, OtherVT>]>;

def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;

def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
                                      SDTCisSameAs<0, 1>,
                                      SDTCisSameAs<0, 2>,
                                      SDTCisSameAs<0, 3>]>;

def ARMsmmlar  : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>;
def ARMsmmlsr  : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>;

// Node definitions.
def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntUnaryOp>;

def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
                              [SDNPHasChain, SDNPSideEffect,
                               SDNPOptInGlue, SDNPOutGlue]>;
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
                                SDT_ARMStructByVal,
                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
                                 SDNPMayStore, SDNPMayLoad]>;

def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                               SDNPVariadic]>;
def ARMcall_pred     : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                               SDNPVariadic]>;
def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                               SDNPVariadic]>;

def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
                              [SDNPInGlue]>;
def ARMsubs          : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>;

def ARMssatnoshift   : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;

def ARMusatnoshift   : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>;

def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
                              [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;

def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
                              [SDNPHasChain]>;
def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
                              [SDNPHasChain]>;

def ARMBcci64        : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
                              [SDNPHasChain]>;

def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
                              [SDNPOutGlue]>;

def ARMcmn           : SDNode<"ARMISD::CMN", SDT_ARMCmp,
                              [SDNPOutGlue]>;

def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
                              [SDNPOutGlue, SDNPCommutative]>;

def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;

def ARMasrl          : SDNode<"ARMISD::ASRL", SDT_ARMIntShiftParts, []>;
def ARMlsrl          : SDNode<"ARMISD::LSRL", SDT_ARMIntShiftParts, []>;
def ARMlsll          : SDNode<"ARMISD::LSLL", SDT_ARMIntShiftParts, []>;

def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
def ARMaddc          : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
                              [SDNPCommutative]>;
def ARMsubc          : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
def ARMlsls          : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>;
def ARMadde          : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
def ARMsube          : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;

def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
                              SDT_ARMEH_SJLJ_Setjmp,
                              [SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
                               SDT_ARMEH_SJLJ_Longjmp,
                               [SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH",
                                      SDT_ARMEH_SJLJ_SetupDispatch,
                                      [SDNPHasChain, SDNPSideEffect]>;

def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
                              [SDNPHasChain, SDNPSideEffect]>;
def ARMPreload       : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
                              [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;

def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;

def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
                         SDNPMayStore, SDNPMayLoad]>;

def ARMsmulwb        : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
def ARMsmulwt        : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
def ARMsmlalbb       : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>;
def ARMsmlalbt       : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
def ARMsmlaltb       : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt       : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;

// Vector operations shared between NEON and MVE

def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;

// VDUPLANE can produce a quad-register result from a double-register source,
// so the result is not constrained to match the source.
def ARMvduplane : SDNode<"ARMISD::VDUPLANE",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
                                              SDTCisVT<2, i32>]>>;

def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;

def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
                                        SDTCisVT<2, i32>]>;
def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;

def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;

def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
                                        SDTCisVT<2, i32>]>;
def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
                                     SDTCisSameAs<0, 2>,]>;
def ARMvshlImm : SDNode<"ARMISD::VSHLIMM", SDTARMVSHIMM>;
def ARMvshrsImm : SDNode<"ARMISD::VSHRsIMM", SDTARMVSHIMM>;
def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;

def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
                                      SDTCisInt<3>]>;
def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;

def ARMvcmp  : SDNode<"ARMISD::VCMP", SDTARMVCMP>;
def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;

def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;

//===----------------------------------------------------------------------===//
// ARM Flag Definitions.

class RegConstraint<string C> {
  string Constraints = C;
}

//===----------------------------------------------------------------------===//
//  ARM specific transformation functions and pattern fragments.
//

// imm_neg_XFORM - Return the negation of an i32 immediate value.
def imm_neg_XFORM : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// imm_not_XFORM - Return the complement of a i32 immediate value.
def imm_not_XFORM : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
def imm16_31 : ImmLeaf<i32, [{
  return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
}]>;

// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
def sext_16_node : PatLeaf<(i32 GPR:$a), [{
  return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
}]>;

def sext_bottom_16 : PatFrag<(ops node:$a),
                             (sext_inreg node:$a, i16)>;
def sext_top_16 : PatFrag<(ops node:$a),
                          (i32 (sra node:$a, (i32 16)))>;

def bb_mul : PatFrag<(ops node:$a, node:$b),
                     (mul (sext_bottom_16 node:$a), (sext_bottom_16 node:$b))>;
def bt_mul : PatFrag<(ops node:$a, node:$b),
                     (mul (sext_bottom_16 node:$a), (sra node:$b, (i32 16)))>;
def tb_mul : PatFrag<(ops node:$a, node:$b),
                     (mul (sra node:$a, (i32 16)), (sext_bottom_16 node:$b))>;
def tt_mul : PatFrag<(ops node:$a, node:$b),
                     (mul (sra node:$a, (i32 16)), (sra node:$b, (i32 16)))>;

/// Split a 32-bit immediate into two 16 bit parts.
def hi16 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, SDLoc(N),
                                   MVT::i32);
}]>;

def lo16AllZero : PatLeaf<(i32 imm), [{
  // Returns true if all low 16-bits are 0.
  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
}], hi16>;
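// Illustration only, not part of the patch: together these let a pattern of
// the form (or (and GPR:$src, 0xffff), lo16AllZero:$imm) select a single
// movt, e.g. OR-ing in 0xABCD0000 writes 0xABCD directly to the top
// halfword; hi16 supplies the encoded immediate 0xABCD.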
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;

// An 'and' node with a single use.
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;

// An 'xor' node with a single use.
def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;

// An 'fmul' node with a single use.
def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
  return N->hasOneUse();
}]>;

// An 'fadd' node which checks for single non-hazardous use.
def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
  return hasNoVMLxHazardUse(N);
}]>;

// An 'fsub' node which checks for single non-hazardous use.
def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
  return hasNoVMLxHazardUse(N);
}]>;

//===----------------------------------------------------------------------===//
// Operand Definitions.
//

// Immediate operands with a shared generic asm render method.
class ImmAsmOperand<int Low, int High> : AsmOperandClass {
  let RenderMethod = "addImmOperands";
  let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
  let DiagnosticString = "operand must be an immediate in the range [" # Low # "," # High # "]";
}

class ImmAsmOperandMinusOne<int Low, int High> : AsmOperandClass {
  let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
  let DiagnosticType = "ImmRange" # Low # "_" # High;
  let DiagnosticString = "operand must be an immediate in the range [" # Low # "," # High # "]";
}

// Operands that are part of a memory addressing mode.
class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }

// Branch target.
// FIXME: rename brtarget to t2_brtarget
def brtarget : Operand<OtherVT> {
  let EncoderMethod = "getBranchTargetOpValue";
  let OperandType = "OPERAND_PCREL";
  let DecoderMethod = "DecodeT2BROperand";
}

// Branches targeting ARM-mode must be divisible by 4 if they're a raw
// immediate.
def ARMBranchTarget : AsmOperandClass {
  let Name = "ARMBranchTarget";
}

// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
// immediate.
def ThumbBranchTarget : AsmOperandClass {
  let Name = "ThumbBranchTarget";
}

def arm_br_target : Operand<i32> {
  let ParserMatchClass = ARMBranchTarget;
  let EncoderMethod = "getARMBranchTargetOpValue";
  let OperandType = "OPERAND_PCREL";
}

// Call target for ARM. Handles conditional/unconditional
// FIXME: rename bl_target to t2_bltarget?
def arm_bl_target : Operand<i32> {
  let ParserMatchClass = ARMBranchTarget;
  let EncoderMethod = "getARMBLTargetOpValue";
  let OperandType = "OPERAND_PCREL";
}

// Target for BLX *from* ARM mode.
def arm_blx_target : Operand<i32> {
  let ParserMatchClass = ThumbBranchTarget;
  let EncoderMethod = "getARMBLXTargetOpValue";
  let OperandType = "OPERAND_PCREL";
}

// A list of registers separated by comma. Used by load/store multiple.
def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
def reglist : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = RegListAsmOperand;
  let PrintMethod = "printRegisterList";
  let DecoderMethod = "DecodeRegListOperand";
}
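// Illustration only, not part of the patch: reglist backs instructions such
// as "push {r4-r7, lr}", where getRegisterListOpValue encodes the braced
// list as a 16-bit register bitmask.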
// A list of general purpose registers and APSR separated by comma.
// Used by CLRM
def RegListWithAPSRAsmOperand : AsmOperandClass { let Name = "RegListWithAPSR"; }
def reglist_with_apsr : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = RegListWithAPSRAsmOperand;
  let PrintMethod = "printRegisterList";
  let DecoderMethod = "DecodeRegListOperand";
}

def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;

def DPRRegListAsmOperand : AsmOperandClass {
  let Name = "DPRRegList";
  let DiagnosticType = "DPR_RegList";
}
def dpr_reglist : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = DPRRegListAsmOperand;
  let PrintMethod = "printRegisterList";
  let DecoderMethod = "DecodeDPRRegListOperand";
}

def SPRRegListAsmOperand : AsmOperandClass {
  let Name = "SPRRegList";
  let DiagnosticString = "operand must be a list of registers in range [s0, s31]";
}
def spr_reglist : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = SPRRegListAsmOperand;
  let PrintMethod = "printRegisterList";
  let DecoderMethod = "DecodeSPRRegListOperand";
}

def FPSRegListWithVPRAsmOperand : AsmOperandClass { let Name = "FPSRegListWithVPR"; }
def fp_sreglist_with_vpr : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = FPSRegListWithVPRAsmOperand;
  let PrintMethod = "printRegisterList";
}

def FPDRegListWithVPRAsmOperand : AsmOperandClass { let Name = "FPDRegListWithVPR"; }
def fp_dreglist_with_vpr : Operand<i32> {
  let EncoderMethod = "getRegisterListOpValue";
  let ParserMatchClass = FPDRegListWithVPRAsmOperand;
  let PrintMethod = "printRegisterList";
}

// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
def cpinst_operand : Operand<i32> {
  let PrintMethod = "printCPInstOperand";
}

// Local PC labels.
def pclabel : Operand<i32> {
  let PrintMethod = "printPCLabel";
}

// ADR instruction labels.
def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
def adrlabel : Operand<i32> {
  let EncoderMethod = "getAdrLabelOpValue";
  let ParserMatchClass = AdrLabelAsmOperand;
  let PrintMethod = "printAdrLabelOperand<0>";
}

def neon_vcvt_imm32 : Operand<i32> {
  let EncoderMethod = "getNEONVcvtImm32OpValue";
  let DecoderMethod = "DecodeVCVTImmOperand";
}

// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
def rot_imm_XFORM: SDNodeXForm<imm, [{
  switch (N->getZExtValue()){
  default: llvm_unreachable(nullptr);
  case 0:  return CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
  case 8:  return CurDAG->getTargetConstant(1, SDLoc(N), MVT::i32);
  case 16: return CurDAG->getTargetConstant(2, SDLoc(N), MVT::i32);
  case 24: return CurDAG->getTargetConstant(3, SDLoc(N), MVT::i32);
  }
}]>;
def RotImmAsmOperand : AsmOperandClass {
  let Name = "RotImm";
  let ParserMethod = "parseRotImm";
}
def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
    int32_t v = N->getZExtValue();
    return v == 8 || v == 16 || v == 24; }],
    rot_imm_XFORM> {
  let PrintMethod = "printRotImmOperand";
  let ParserMatchClass = RotImmAsmOperand;
}
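// Illustration only, not part of the patch: the extend instructions use this
// operand, e.g. "sxtab r0, r1, r2, ror #16" sign-extends byte 2 of r2 before
// the add; rot_imm_XFORM stores the rotation 16 as the 2-bit field value 2.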
// Power-of-two operand for MVE VIDUP and friends, which encode
// {1,2,4,8} as its log to base 2, i.e. as {0,1,2,3} respectively
def MVE_VIDUP_imm_asmoperand : AsmOperandClass {
  let Name = "VIDUP_imm";
  let PredicateMethod = "isPowerTwoInRange<1,8>";
  let RenderMethod = "addPowerTwoOperands";
  let DiagnosticString = "vector increment immediate must be 1, 2, 4 or 8";
}
def MVE_VIDUP_imm : Operand<i32> {
  let EncoderMethod = "getPowerTwoOpValue";
  let DecoderMethod = "DecodePowerTwoOperand<0,3>";
  let ParserMatchClass = MVE_VIDUP_imm_asmoperand;
}

// Pair vector indexing
class MVEPairVectorIndexOperand<string start, string end> : AsmOperandClass {
  let Name = "MVEPairVectorIndex"#start;
  let RenderMethod = "addMVEPairVectorIndexOperands";
  let PredicateMethod = "isMVEPairVectorIndex<"#start#", "#end#">";
}

class MVEPairVectorIndex<string opval> : Operand<i32> {
  let PrintMethod = "printVectorIndex";
  let EncoderMethod = "getMVEPairVectorIndexOpValue<"#opval#">";
  let DecoderMethod = "DecodeMVEPairVectorIndexOperand<"#opval#">";
  let MIOperandInfo = (ops i32imm);
}

def MVEPairVectorIndex0 : MVEPairVectorIndex<"0"> {
  let ParserMatchClass = MVEPairVectorIndexOperand<"0", "1">;
}

def MVEPairVectorIndex2 : MVEPairVectorIndex<"2"> {
  let ParserMatchClass = MVEPairVectorIndexOperand<"2", "3">;
}

// Vector indexing
class MVEVectorIndexOperand<int NumLanes> : AsmOperandClass {
  let Name = "MVEVectorIndex"#NumLanes;
  let RenderMethod = "addMVEVectorIndexOperands";
  let PredicateMethod = "isVectorIndexInRange<"#NumLanes#">";
}

class MVEVectorIndex<int NumLanes> : Operand<i32> {
  let PrintMethod = "printVectorIndex";
  let ParserMatchClass = MVEVectorIndexOperand<NumLanes>;
  let MIOperandInfo = (ops i32imm);
}

// shift_imm: An integer that encodes a shift amount and the type of shift
// (asr or lsl). The 6-bit immediate encodes as:
//    {5}     0 ==> lsl
//            1     asr
//    {4-0}   imm5 shift amount.
//            asr #32 encoded as imm5 == 0.
def ShifterImmAsmOperand : AsmOperandClass {
  let Name = "ShifterImm";
  let ParserMethod = "parseShifterImm";
}
def shift_imm : Operand<i32> {
  let PrintMethod = "printShiftImmOperand";
  let ParserMatchClass = ShifterImmAsmOperand;
}

// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm.
def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
def so_reg_reg : Operand<i32>,  // reg reg imm
                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
                                [shl, srl, sra, rotr]> {
  let EncoderMethod = "getSORegRegOpValue";
  let PrintMethod = "printSORegRegOperand";
  let DecoderMethod = "DecodeSORegRegOperand";
  let ParserMatchClass = ShiftedRegAsmOperand;
  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
}

def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
def so_reg_imm : Operand<i32>,  // reg imm
                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
                                [shl, srl, sra, rotr]> {
  let EncoderMethod = "getSORegImmOpValue";
  let PrintMethod = "printSORegImmOperand";
  let DecoderMethod = "DecodeSORegImmOperand";
  let ParserMatchClass = ShiftedImmAsmOperand;
  let MIOperandInfo = (ops GPR, i32imm);
}

// FIXME: Does this need to be distinct from so_reg?
def shift_so_reg_reg : Operand<i32>,    // reg reg imm
                       ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
                                      [shl, srl, sra, rotr]> {
  let EncoderMethod = "getSORegRegOpValue";
  let PrintMethod = "printSORegRegOperand";
  let DecoderMethod = "DecodeSORegRegOperand";
  let ParserMatchClass = ShiftedRegAsmOperand;
  let MIOperandInfo = (ops GPR, GPR, i32imm);
}

// FIXME: Does this need to be distinct from so_reg?
def shift_so_reg_imm : Operand<i32>,    // reg reg imm
                       ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
                                      [shl, srl, sra, rotr]> {
  let EncoderMethod = "getSORegImmOpValue";
  let PrintMethod = "printSORegImmOperand";
  let DecoderMethod = "DecodeSORegImmOperand";
  let ParserMatchClass = ShiftedImmAsmOperand;
  let MIOperandInfo = (ops GPR, i32imm);
}

// mod_imm: match a 32-bit immediate operand, which can be encoded into
// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
// - "Modified Immediate Constants"). Within the MC layer we keep this
// immediate in its encoded form.
def ModImmAsmOperand: AsmOperandClass {
  let Name = "ModImm";
  let ParserMethod = "parseModImm";
}
def mod_imm : Operand<i32>, ImmLeaf<i32, [{
    return ARM_AM::getSOImmVal(Imm) != -1;
  }]> {
  let EncoderMethod = "getModImmOpValue";
  let PrintMethod = "printModImmOperand";
  let ParserMatchClass = ModImmAsmOperand;
}

// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
// The actual parsing, encoding, decoding are handled by the destination
// instructions, which use mod_imm.

def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
  }], imm_not_XFORM> {
  let ParserMatchClass = ModImmNotAsmOperand;
}

def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
    unsigned Value = -(unsigned)N->getZExtValue();
    return Value && ARM_AM::getSOImmVal(Value) != -1;
  }], imm_neg_XFORM> {
  let ParserMatchClass = ModImmNegAsmOperand;
}
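// Illustration only, not part of the patch: 0x00FF0000 is a valid mod_imm
// (0xFF rotated right by 16), whereas 0x00FFFF00 is not directly encodable;
// ARM_AM::isSOImmTwoPartVal accepts it as the two-instruction split
// 0x00FF0000 + 0x0000FF00.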
// FIXME: Does this need to be distinct from so_reg?
def shift_so_reg_imm : Operand<i32>,    // reg imm
                       ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
                                      [shl, srl, sra, rotr]> {
  let EncoderMethod = "getSORegImmOpValue";
  let PrintMethod = "printSORegImmOperand";
  let DecoderMethod = "DecodeSORegImmOperand";
  let ParserMatchClass = ShiftedImmAsmOperand;
  let MIOperandInfo = (ops GPR, i32imm);
}

// mod_imm: match a 32-bit immediate operand, which can be encoded into
// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
// - "Modified Immediate Constants"). Within the MC layer we keep this
// immediate in its encoded form.
def ModImmAsmOperand: AsmOperandClass {
  let Name = "ModImm";
  let ParserMethod = "parseModImm";
}
def mod_imm : Operand<i32>, ImmLeaf<i32, [{
    return ARM_AM::getSOImmVal(Imm) != -1;
  }]> {
  let EncoderMethod = "getModImmOpValue";
  let PrintMethod = "printModImmOperand";
  let ParserMatchClass = ModImmAsmOperand;
}

// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
// The actual parsing, encoding, decoding are handled by the destination
// instructions, which use mod_imm.

def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
  }], imm_not_XFORM> {
  let ParserMatchClass = ModImmNotAsmOperand;
}

def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
    unsigned Value = -(unsigned)N->getZExtValue();
    return Value && ARM_AM::getSOImmVal(Value) != -1;
  }], imm_neg_XFORM> {
  let ParserMatchClass = ModImmNegAsmOperand;
}

/// arm_i32imm - True for +V6T2, or when isSOImmTwoPartVal()
def arm_i32imm : IntImmLeaf<i32, [{
  if (Subtarget->useMovt())
    return true;
  return ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue());
}]>;

/// imm0_1 predicate - Immediate in the range [0,1].
def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }

/// imm0_3 predicate - Immediate in the range [0,3].
def Imm0_3AsmOperand: ImmAsmOperand<0,3> { let Name = "Imm0_3"; }
def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }

/// imm0_7 predicate - Immediate in the range [0,7].
def Imm0_7AsmOperand: ImmAsmOperand<0,7> { let Name = "Imm0_7"; }
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 8;
}]> {
  let ParserMatchClass = Imm0_7AsmOperand;
}

/// imm8_255 predicate - Immediate in the range [8,255].
def Imm8_255AsmOperand: ImmAsmOperand<8,255> { let Name = "Imm8_255"; }
def imm8_255 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 8 && Imm < 256;
}]> {
  let ParserMatchClass = Imm8_255AsmOperand;
}

/// imm8 predicate - Immediate is exactly 8.
def Imm8AsmOperand: ImmAsmOperand<8,8> { let Name = "Imm8"; }
def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
  let ParserMatchClass = Imm8AsmOperand;
}

/// imm16 predicate - Immediate is exactly 16.
def Imm16AsmOperand: ImmAsmOperand<16,16> { let Name = "Imm16"; }
def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
  let ParserMatchClass = Imm16AsmOperand;
}

/// imm32 predicate - Immediate is exactly 32.
def Imm32AsmOperand: ImmAsmOperand<32,32> { let Name = "Imm32"; }
def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
  let ParserMatchClass = Imm32AsmOperand;
}

def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16; }]>;

/// imm1_7 predicate - Immediate in the range [1,7].
def Imm1_7AsmOperand: ImmAsmOperand<1,7> { let Name = "Imm1_7"; }
def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
  let ParserMatchClass = Imm1_7AsmOperand;
}
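// The "Modified Immediate Constants" test that ARM_AM::getSOImmVal performs
// for mod_imm above can be modelled stand-alone: a value is encodable iff it
// is an 8-bit value rotated right by an even amount. A C++ sketch
// (illustrative only; the real helper returns the encoded 12-bit form
// rather than a bool):
/*
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned N) {
  return N == 0 ? V : ((V << N) | (V >> (32 - N)));
}

// True if V == rotr(imm8, 2*rot4) for some imm8 in [0,255], rot4 in [0,15].
bool isModifiedImm(uint32_t V) {
  for (unsigned R = 0; R < 32; R += 2)
    if ((rotl32(V, R) & ~0xffu) == 0)
      return true;
  return false;
}
*/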
/// imm1_15 predicate - Immediate in the range [1,15].
def Imm1_15AsmOperand: ImmAsmOperand<1,15> { let Name = "Imm1_15"; }
def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
  let ParserMatchClass = Imm1_15AsmOperand;
}

/// imm1_31 predicate - Immediate in the range [1,31].
def Imm1_31AsmOperand: ImmAsmOperand<1,31> { let Name = "Imm1_31"; }
def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
  let ParserMatchClass = Imm1_31AsmOperand;
}

/// imm0_15 predicate - Immediate in the range [0,15].
def Imm0_15AsmOperand: ImmAsmOperand<0,15> { let Name = "Imm0_15"; }
def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 16;
}]> {
  let ParserMatchClass = Imm0_15AsmOperand;
}

/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
def Imm0_31AsmOperand: ImmAsmOperand<0,31> { let Name = "Imm0_31"; }
def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 32;
}]> {
  let ParserMatchClass = Imm0_31AsmOperand;
}

/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
def Imm0_32AsmOperand: ImmAsmOperand<0,32> { let Name = "Imm0_32"; }
def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 33;
}]> {
  let ParserMatchClass = Imm0_32AsmOperand;
}

/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
def Imm0_63AsmOperand: ImmAsmOperand<0,63> { let Name = "Imm0_63"; }
def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 64;
}]> {
  let ParserMatchClass = Imm0_63AsmOperand;
}

/// imm0_239 predicate - Immediate in the range [0,239].
def Imm0_239AsmOperand : ImmAsmOperand<0,239> { let Name = "Imm0_239"; }
def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
  let ParserMatchClass = Imm0_239AsmOperand;
}

/// imm0_255 predicate - Immediate in the range [0,255].
def Imm0_255AsmOperand : ImmAsmOperand<0,255> { let Name = "Imm0_255"; }
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
  let ParserMatchClass = Imm0_255AsmOperand;
}

/// imm0_65535 - An immediate is in the range [0,65535].
def Imm0_65535AsmOperand: ImmAsmOperand<0,65535> { let Name = "Imm0_65535"; }
def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm < 65536;
}]> {
  let ParserMatchClass = Imm0_65535AsmOperand;
}

// imm0_65535_neg - An immediate whose negative value is in the range [0,65535].
def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
  return -Imm >= 0 && -Imm < 65536;
}]>;

// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
// a relocatable expression.
//
// FIXME: This really needs a Thumb version separate from the ARM version.
// While the range is the same, and can thus use the same match class,
// the encoding is different so it should have a different encoder method.
def Imm0_65535ExprAsmOperand: AsmOperandClass {
  let Name = "Imm0_65535Expr";
  let RenderMethod = "addImmOperands";
  let DiagnosticString = "operand must be an immediate in the range [0,0xffff] or a relocatable expression";
}

def imm0_65535_expr : Operand<i32> {
  let EncoderMethod = "getHiLo16ImmOpValue";
  let ParserMatchClass = Imm0_65535ExprAsmOperand;
}

def Imm256_65535ExprAsmOperand: ImmAsmOperand<256,65535> { let Name = "Imm256_65535Expr"; }
def imm256_65535_expr : Operand<i32> {
  let ParserMatchClass = Imm256_65535ExprAsmOperand;
}
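// imm0_65535_expr exists for movw/movt, which together can materialize any
// 32-bit value from two 16-bit halves. A C++ sketch of the split
// (hypothetical helper, for illustration):
/*
#include <cstdint>

struct MovwMovt {
  uint16_t Lo; // movw Rd, #Lo writes the low half and zeroes the high half
  uint16_t Hi; // movt Rd, #Hi then writes the high half, keeping the low
};

MovwMovt splitImm32(uint32_t V) {
  return {static_cast<uint16_t>(V & 0xffffu),
          static_cast<uint16_t>(V >> 16)};
}
*/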
/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
def Imm24bitAsmOperand: ImmAsmOperand<0,0xffffff> {
  let Name = "Imm24bit";
  let DiagnosticString = "operand must be an immediate in the range [0,0xffffff]";
}
def imm24b : Operand<i32>, ImmLeaf<i32, [{
  return Imm >= 0 && Imm <= 0xffffff;
}]> {
  let ParserMatchClass = Imm24bitAsmOperand;
}

/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
/// e.g., 0xf000ffff
def BitfieldAsmOperand : AsmOperandClass {
  let Name = "Bitfield";
  let ParserMethod = "parseBitfield";
}

def bf_inv_mask_imm : Operand<i32>,
                      PatLeaf<(imm), [{
  return ARM::isBitFieldInvertedMask(N->getZExtValue());
}] > {
  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
  let PrintMethod = "printBitfieldInvMaskImmOperand";
  let DecoderMethod = "DecodeBitfieldMaskOperand";
  let ParserMatchClass = BitfieldAsmOperand;
  let GISelPredicateCode = [{
    // There are better methods of implementing this check. IntImmLeaf<> would
    // be equivalent and have less boilerplate but we need a test for C++
    // predicates and this one causes new rules to be imported into GlobalISel
    // without requiring additional features first.
    const auto &MO = MI.getOperand(1);
    if (!MO.isCImm())
      return false;
    return ARM::isBitFieldInvertedMask(MO.getCImm()->getZExtValue());
  }];
}

def imm1_32_XFORM: SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
                                   MVT::i32);
}]>;
def Imm1_32AsmOperand: ImmAsmOperandMinusOne<1,32> { let Name = "Imm1_32"; }
def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
    uint64_t Imm = N->getZExtValue();
    return Imm > 0 && Imm <= 32;
  }], imm1_32_XFORM> {
  let PrintMethod = "printImmPlusOneOperand";
  let ParserMatchClass = Imm1_32AsmOperand;
}

def imm1_16_XFORM: SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
                                   MVT::i32);
}]>;
def Imm1_16AsmOperand: ImmAsmOperandMinusOne<1,16> { let Name = "Imm1_16"; }
def imm1_16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }],
    imm1_16_XFORM> {
  let PrintMethod = "printImmPlusOneOperand";
  let ParserMatchClass = Imm1_16AsmOperand;
}

def MVEShiftImm1_7AsmOperand: ImmAsmOperand<1,7> {
  let Name = "MVEShiftImm1_7";
  // The reason we have this operand is that the vshll.s8 t1 encoding accepts
  // 1-7 while the t2 encoding accepts 8. Keeping both under one diagnostic
  // gives a better message if someone uses a bigger immediate than the t1/t2
  // encodings allow.
  let DiagnosticString = "operand must be an immediate in the range [1,8]";
}
def mve_shift_imm1_7 : Operand<i32> {
  let ParserMatchClass = MVEShiftImm1_7AsmOperand;
  let EncoderMethod = "getMVEShiftImmOpValue";
}

def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> {
  let Name = "MVEShiftImm1_15";
  // The reason we have this operand is that the vshll.s16 t1 encoding accepts
  // 1-15 while the t2 encoding accepts 16. Keeping both under one diagnostic
  // gives a better message if someone uses a bigger immediate than the t1/t2
  // encodings allow.
  let DiagnosticString = "operand must be an immediate in the range [1,16]";
}
def mve_shift_imm1_15 : Operand<i32> {
  let ParserMatchClass = MVEShiftImm1_15AsmOperand;
  let EncoderMethod = "getMVEShiftImmOpValue";
}

// Define ARM-specific addressing modes.
// addrmode_imm12 := reg +/- imm12
//
def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
class AddrMode_Imm12 : MemOperand,
                       ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
  // 12-bit immediate operand. Note that instructions using this encode
  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
  // immediate values are as normal.
let EncoderMethod = "getAddrModeImm12OpValue"; let DecoderMethod = "DecodeAddrModeImm12Operand"; let ParserMatchClass = MemImm12OffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); } def addrmode_imm12 : AddrMode_Imm12 { let PrintMethod = "printAddrModeImm12Operand"; } def addrmode_imm12_pre : AddrMode_Imm12 { let PrintMethod = "printAddrModeImm12Operand"; } // ldst_so_reg := reg +/- reg shop imm // def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; } def ldst_so_reg : MemOperand, ComplexPattern { let EncoderMethod = "getLdStSORegOpValue"; // FIXME: Simplify the printer let PrintMethod = "printAddrMode2Operand"; let DecoderMethod = "DecodeSORegMemOperand"; let ParserMatchClass = MemRegOffsetAsmOperand; let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift); } // postidx_imm8 := +/- [0,255] // // 9 bit value: // {8} 1 is imm8 is non-negative. 0 otherwise. // {7-0} [0,255] imm8 value. def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; } def postidx_imm8 : MemOperand { let PrintMethod = "printPostIdxImm8Operand"; let ParserMatchClass = PostIdxImm8AsmOperand; let MIOperandInfo = (ops i32imm); } // postidx_imm8s4 := +/- [0,1020] // // 9 bit value: // {8} 1 is imm8 is non-negative. 0 otherwise. // {7-0} [0,255] imm8 value, scaled by 4. def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; } def postidx_imm8s4 : MemOperand { let PrintMethod = "printPostIdxImm8s4Operand"; let ParserMatchClass = PostIdxImm8s4AsmOperand; let MIOperandInfo = (ops i32imm); } // postidx_reg := +/- reg // def PostIdxRegAsmOperand : AsmOperandClass { let Name = "PostIdxReg"; let ParserMethod = "parsePostIdxReg"; } def postidx_reg : MemOperand { let EncoderMethod = "getPostIdxRegOpValue"; let DecoderMethod = "DecodePostIdxReg"; let PrintMethod = "printPostIdxRegOperand"; let ParserMatchClass = PostIdxRegAsmOperand; let MIOperandInfo = (ops GPRnopc, i32imm); } def PostIdxRegShiftedAsmOperand : AsmOperandClass { let Name = "PostIdxRegShifted"; let ParserMethod = "parsePostIdxReg"; } def am2offset_reg : MemOperand, ComplexPattern { let EncoderMethod = "getAddrMode2OffsetOpValue"; let PrintMethod = "printAddrMode2OffsetOperand"; // When using this for assembly, it's always as a post-index offset. let ParserMatchClass = PostIdxRegShiftedAsmOperand; let MIOperandInfo = (ops GPRnopc, i32imm); } // FIXME: am2offset_imm should only need the immediate, not the GPR. Having // the GPR is purely vestigal at this point. def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; } def am2offset_imm : MemOperand, ComplexPattern { let EncoderMethod = "getAddrMode2OffsetOpValue"; let PrintMethod = "printAddrMode2OffsetOperand"; let ParserMatchClass = AM2OffsetImmAsmOperand; let MIOperandInfo = (ops GPRnopc, i32imm); } // addrmode3 := reg +/- reg // addrmode3 := reg +/- imm8 // // FIXME: split into imm vs. reg versions. def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; } class AddrMode3 : MemOperand, ComplexPattern { let EncoderMethod = "getAddrMode3OpValue"; let ParserMatchClass = AddrMode3AsmOperand; let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); } def addrmode3 : AddrMode3 { let PrintMethod = "printAddrMode3Operand"; } def addrmode3_pre : AddrMode3 { let PrintMethod = "printAddrMode3Operand"; } // FIXME: split into imm vs. reg versions. // FIXME: parser method to handle +/- register. 
def AM3OffsetAsmOperand : AsmOperandClass { let Name = "AM3Offset"; let ParserMethod = "parseAM3Offset"; } def am3offset : MemOperand, ComplexPattern { let EncoderMethod = "getAddrMode3OffsetOpValue"; let PrintMethod = "printAddrMode3OffsetOperand"; let ParserMatchClass = AM3OffsetAsmOperand; let MIOperandInfo = (ops GPR, i32imm); } // ldstm_mode := {ia, ib, da, db} // def ldstm_mode : OptionalDefOperand { let EncoderMethod = "getLdStmModeOpValue"; let PrintMethod = "printLdStmModeOperand"; } // addrmode5 := reg +/- imm8*4 // def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; } class AddrMode5 : MemOperand, ComplexPattern { let EncoderMethod = "getAddrMode5OpValue"; let DecoderMethod = "DecodeAddrMode5Operand"; let ParserMatchClass = AddrMode5AsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm); } def addrmode5 : AddrMode5 { let PrintMethod = "printAddrMode5Operand"; } def addrmode5_pre : AddrMode5 { let PrintMethod = "printAddrMode5Operand"; } // addrmode5fp16 := reg +/- imm8*2 // def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; } class AddrMode5FP16 : Operand, ComplexPattern { let EncoderMethod = "getAddrMode5FP16OpValue"; let DecoderMethod = "DecodeAddrMode5FP16Operand"; let ParserMatchClass = AddrMode5FP16AsmOperand; let MIOperandInfo = (ops GPR:$base, i32imm); } def addrmode5fp16 : AddrMode5FP16 { let PrintMethod = "printAddrMode5FP16Operand"; } // addrmode6 := reg with optional alignment // def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; } def addrmode6 : MemOperand, ComplexPattern{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm:$align); let EncoderMethod = "getAddrMode6AddressOpValue"; let DecoderMethod = "DecodeAddrMode6Operand"; let ParserMatchClass = AddrMode6AsmOperand; } def am6offset : MemOperand, ComplexPattern { let PrintMethod = "printAddrMode6OffsetOperand"; let MIOperandInfo = (ops GPR); let EncoderMethod = "getAddrMode6OffsetOpValue"; let DecoderMethod = "DecodeGPRRegisterClass"; } // Special version of addrmode6 to handle alignment encoding for VST1/VLD1 // (single element from one lane) for size 32. def addrmode6oneL32 : MemOperand, ComplexPattern{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm); let EncoderMethod = "getAddrMode6OneLane32AddressOpValue"; } // Base class for addrmode6 with specific alignment restrictions. class AddrMode6Align : MemOperand, ComplexPattern{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm:$align); let EncoderMethod = "getAddrMode6AddressOpValue"; let DecoderMethod = "DecodeAddrMode6Operand"; } // Special version of addrmode6 to handle no allowed alignment encoding for // VLD/VST instructions and checking the alignment is not specified. def AddrMode6AlignNoneAsmOperand : AsmOperandClass { let Name = "AlignedMemoryNone"; let DiagnosticString = "alignment must be omitted"; } def addrmode6alignNone : AddrMode6Align { // The alignment specifier can only be omitted. let ParserMatchClass = AddrMode6AlignNoneAsmOperand; } // Special version of addrmode6 to handle 16-bit alignment encoding for // VLD/VST instructions and checking the alignment value. def AddrMode6Align16AsmOperand : AsmOperandClass { let Name = "AlignedMemory16"; let DiagnosticString = "alignment must be 16 or omitted"; } def addrmode6align16 : AddrMode6Align { // The alignment specifier can only be 16 or omitted. 
let ParserMatchClass = AddrMode6Align16AsmOperand; } // Special version of addrmode6 to handle 32-bit alignment encoding for // VLD/VST instructions and checking the alignment value. def AddrMode6Align32AsmOperand : AsmOperandClass { let Name = "AlignedMemory32"; let DiagnosticString = "alignment must be 32 or omitted"; } def addrmode6align32 : AddrMode6Align { // The alignment specifier can only be 32 or omitted. let ParserMatchClass = AddrMode6Align32AsmOperand; } // Special version of addrmode6 to handle 64-bit alignment encoding for // VLD/VST instructions and checking the alignment value. def AddrMode6Align64AsmOperand : AsmOperandClass { let Name = "AlignedMemory64"; let DiagnosticString = "alignment must be 64 or omitted"; } def addrmode6align64 : AddrMode6Align { // The alignment specifier can only be 64 or omitted. let ParserMatchClass = AddrMode6Align64AsmOperand; } // Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding // for VLD/VST instructions and checking the alignment value. def AddrMode6Align64or128AsmOperand : AsmOperandClass { let Name = "AlignedMemory64or128"; let DiagnosticString = "alignment must be 64, 128 or omitted"; } def addrmode6align64or128 : AddrMode6Align { // The alignment specifier can only be 64, 128 or omitted. let ParserMatchClass = AddrMode6Align64or128AsmOperand; } // Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment // encoding for VLD/VST instructions and checking the alignment value. def AddrMode6Align64or128or256AsmOperand : AsmOperandClass { let Name = "AlignedMemory64or128or256"; let DiagnosticString = "alignment must be 64, 128, 256 or omitted"; } def addrmode6align64or128or256 : AddrMode6Align { // The alignment specifier can only be 64, 128, 256 or omitted. let ParserMatchClass = AddrMode6Align64or128or256AsmOperand; } // Special version of addrmode6 to handle alignment encoding for VLD-dup // instructions, specifically VLD4-dup. def addrmode6dup : MemOperand, ComplexPattern{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm); let EncoderMethod = "getAddrMode6DupAddressOpValue"; // FIXME: This is close, but not quite right. The alignment specifier is // different. let ParserMatchClass = AddrMode6AsmOperand; } // Base class for addrmode6dup with specific alignment restrictions. class AddrMode6DupAlign : MemOperand, ComplexPattern{ let PrintMethod = "printAddrMode6Operand"; let MIOperandInfo = (ops GPR:$addr, i32imm); let EncoderMethod = "getAddrMode6DupAddressOpValue"; } // Special version of addrmode6 to handle no allowed alignment encoding for // VLD-dup instruction and checking the alignment is not specified. def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass { let Name = "DupAlignedMemoryNone"; let DiagnosticString = "alignment must be omitted"; } def addrmode6dupalignNone : AddrMode6DupAlign { // The alignment specifier can only be omitted. let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand; } // Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup // instruction and checking the alignment value. def AddrMode6dupAlign16AsmOperand : AsmOperandClass { let Name = "DupAlignedMemory16"; let DiagnosticString = "alignment must be 16 or omitted"; } def addrmode6dupalign16 : AddrMode6DupAlign { // The alignment specifier can only be 16 or omitted. let ParserMatchClass = AddrMode6dupAlign16AsmOperand; } // Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup // instruction and checking the alignment value. 
def AddrMode6dupAlign32AsmOperand : AsmOperandClass { let Name = "DupAlignedMemory32"; let DiagnosticString = "alignment must be 32 or omitted"; } def addrmode6dupalign32 : AddrMode6DupAlign { // The alignment specifier can only be 32 or omitted. let ParserMatchClass = AddrMode6dupAlign32AsmOperand; } // Special version of addrmode6 to handle 64-bit alignment encoding for VLD // instructions and checking the alignment value. def AddrMode6dupAlign64AsmOperand : AsmOperandClass { let Name = "DupAlignedMemory64"; let DiagnosticString = "alignment must be 64 or omitted"; } def addrmode6dupalign64 : AddrMode6DupAlign { // The alignment specifier can only be 64 or omitted. let ParserMatchClass = AddrMode6dupAlign64AsmOperand; } // Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding // for VLD instructions and checking the alignment value. def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass { let Name = "DupAlignedMemory64or128"; let DiagnosticString = "alignment must be 64, 128 or omitted"; } def addrmode6dupalign64or128 : AddrMode6DupAlign { // The alignment specifier can only be 64, 128 or omitted. let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand; } // addrmodepc := pc + reg // def addrmodepc : MemOperand, ComplexPattern { let PrintMethod = "printAddrModePCOperand"; let MIOperandInfo = (ops GPR, i32imm); } // addr_offset_none := reg // def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; } def addr_offset_none : MemOperand, ComplexPattern { let PrintMethod = "printAddrMode7Operand"; let DecoderMethod = "DecodeAddrMode7Operand"; let ParserMatchClass = MemNoOffsetAsmOperand; let MIOperandInfo = (ops GPR:$base); } // t_addr_offset_none := reg [r0-r7] def MemNoOffsetTAsmOperand : AsmOperandClass { let Name = "MemNoOffsetT"; } def t_addr_offset_none : MemOperand { let PrintMethod = "printAddrMode7Operand"; let DecoderMethod = "DecodetGPRRegisterClass"; let ParserMatchClass = MemNoOffsetTAsmOperand; let MIOperandInfo = (ops tGPR:$base); } def nohash_imm : Operand { let PrintMethod = "printNoHashImmediate"; } def CoprocNumAsmOperand : AsmOperandClass { let Name = "CoprocNum"; let ParserMethod = "parseCoprocNumOperand"; } def p_imm : Operand { let PrintMethod = "printPImmediate"; let ParserMatchClass = CoprocNumAsmOperand; let DecoderMethod = "DecodeCoprocessor"; } def CoprocRegAsmOperand : AsmOperandClass { let Name = "CoprocReg"; let ParserMethod = "parseCoprocRegOperand"; } def c_imm : Operand { let PrintMethod = "printCImmediate"; let ParserMatchClass = CoprocRegAsmOperand; } def CoprocOptionAsmOperand : AsmOperandClass { let Name = "CoprocOption"; let ParserMethod = "parseCoprocOptionOperand"; } def coproc_option_imm : Operand { let PrintMethod = "printCoprocOptionImm"; let ParserMatchClass = CoprocOptionAsmOperand; } //===----------------------------------------------------------------------===// include "ARMInstrFormats.td" //===----------------------------------------------------------------------===// // Multiclass helpers... // /// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a /// binop that produces a value. let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_bin_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, SDPatternOperator opnode, bit Commutable = 0> { // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. 
let isReMaterializable = 1 in { def ri : AsI1, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; let Inst{25} = 1; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-0} = imm; } } def rr : AsI1, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{25} = 0; let isCommutable = Commutable; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-4} = 0b00000000; let Inst{3-0} = Rm; } def rsi : AsI1, Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; } def rsr : AsI1, Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } } /// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are /// reversed. The 'rr' form is only defined for the disassembler; for codegen /// it is equivalent to the AsI1_bin_irs counterpart. let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_rbin_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, SDNode opnode, bit Commutable = 0> { // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. let isReMaterializable = 1 in { def ri : AsI1, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; let Inst{25} = 1; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-0} = imm; } } def rr : AsI1, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; let Inst{15-12} = Rd; let Inst{19-16} = Rn; } def rsi : AsI1, Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; } def rsr : AsI1, Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } } /// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default. /// /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. let hasPostISelHook = 1, Defs = [CPSR] in { multiclass AsI1_bin_s_irs { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p), 4, iii, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>, Sched<[WriteALU, ReadALU]>; def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p), 4, iir, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>, Sched<[WriteALU, ReadALU, ReadALU]> { let isCommutable = Commutable; } def rsi : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift))]>, Sched<[WriteALUsi, ReadALU]>; def rsr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift))]>, Sched<[WriteALUSsr, ReadALUsr]>; } } /// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG /// operands are reversed. 
let hasPostISelHook = 1, Defs = [CPSR] in { multiclass AsI1_rbin_s_is { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p), 4, iii, [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>, Sched<[WriteALU, ReadALU]>; def rsi : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn))]>, Sched<[WriteALUsi, ReadALU]>; def rsr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift, pred:$p), 4, iis, [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn))]>, Sched<[WriteALUSsr, ReadALUsr]>; } } /// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test /// patterns. Similar to AsI1_bin_irs except the instruction does not produce /// a explicit result, only implicitly set CPSR. let isCompare = 1, Defs = [CPSR] in { multiclass AI1_cmp_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, SDPatternOperator opnode, bit Commutable = 0, string rrDecoderMethod = ""> { def ri : AI1, Sched<[WriteCMP, ReadALU]> { bits<4> Rn; bits<12> imm; let Inst{25} = 1; let Inst{20} = 1; let Inst{19-16} = Rn; let Inst{15-12} = 0b0000; let Inst{11-0} = imm; let Unpredictable{15-12} = 0b1111; } def rr : AI1, Sched<[WriteCMP, ReadALU, ReadALU]> { bits<4> Rn; bits<4> Rm; let isCommutable = Commutable; let Inst{25} = 0; let Inst{20} = 1; let Inst{19-16} = Rn; let Inst{15-12} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{3-0} = Rm; let DecoderMethod = rrDecoderMethod; let Unpredictable{15-12} = 0b1111; } def rsi : AI1, Sched<[WriteCMPsi, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{20} = 1; let Inst{19-16} = Rn; let Inst{15-12} = 0b0000; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; let Unpredictable{15-12} = 0b1111; } def rsr : AI1, Sched<[WriteCMPsr, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{20} = 1; let Inst{19-16} = Rn; let Inst{15-12} = 0b0000; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; let Unpredictable{15-12} = 0b1111; } } } /// AI_ext_rrot - A unary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. /// FIXME: Remove the 'r' variant. Its rot_imm is zero. class AI_ext_rrot opcod, string opc, PatFrag opnode> : AExtI, Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { bits<4> Rd; bits<4> Rm; bits<2> rot; let Inst{19-16} = 0b1111; let Inst{15-12} = Rd; let Inst{11-10} = rot; let Inst{3-0} = Rm; } class AI_ext_rrot_np opcod, string opc> : AExtI, Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { bits<2> rot; let Inst{19-16} = 0b1111; let Inst{11-10} = rot; } /// AI_exta_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. class AI_exta_rrot opcod, string opc, PatFrag opnode> : AExtI, Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { bits<4> Rd; bits<4> Rm; bits<4> Rn; bits<2> rot; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-10} = rot; let Inst{9-4} = 0b000111; let Inst{3-0} = Rm; } class AI_exta_rrot_np opcod, string opc> : AExtI, Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { bits<4> Rn; bits<2> rot; let Inst{19-16} = Rn; let Inst{11-10} = rot; } /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. 
let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_adde_sube_irs opcod, string opc, SDNode opnode, bit Commutable = 0> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { def ri : AsI1, Requires<[IsARM]>, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; let Inst{25} = 1; let Inst{15-12} = Rd; let Inst{19-16} = Rn; let Inst{11-0} = imm; } def rr : AsI1, Requires<[IsARM]>, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let isCommutable = Commutable; let Inst{3-0} = Rm; let Inst{15-12} = Rd; let Inst{19-16} = Rn; } def rsi : AsI1, Requires<[IsARM]>, Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; } def rsr : AsI1, Requires<[IsARM]>, Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } } } /// AI1_rsc_irs - Define instructions and patterns for rsc let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_rsc_irs opcod, string opc, SDNode opnode> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { def ri : AsI1, Requires<[IsARM]>, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> imm; let Inst{25} = 1; let Inst{15-12} = Rd; let Inst{19-16} = Rn; let Inst{11-0} = imm; } def rr : AsI1, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; let Inst{15-12} = Rd; let Inst{19-16} = Rn; } def rsi : AsI1, Requires<[IsARM]>, Sched<[WriteALUsi, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; } def rsr : AsI1, Requires<[IsARM]>, Sched<[WriteALUsr, ReadALUsr]> { bits<4> Rd; bits<4> Rn; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } } } let canFoldAsLoad = 1, isReMaterializable = 1 in { multiclass AI_ldr1 { // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. 
def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr), AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift), AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = shift{11-0}; } } } let canFoldAsLoad = 1, isReMaterializable = 1 in { multiclass AI_ldr1nopc { // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr), AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift), AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = shift{11-0}; } } } multiclass AI_str1 { // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. def i12 : AI2ldst<0b010, 0, isByte, (outs), (ins GPR:$Rt, addrmode_imm12:$addr), AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", [(opnode GPR:$Rt, addrmode_imm12:$addr)]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift), AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = shift{11-0}; } } multiclass AI_str1nopc { // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. 
def i12 : AI2ldst<0b010, 0, isByte, (outs), (ins GPRnopc:$Rt, addrmode_imm12:$addr), AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift), AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; let Inst{11-0} = shift{11-0}; } } //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Miscellaneous Instructions. // /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in /// the function. The first operand is the ID# for this instruction, the second /// is the index into the MachineConstantPool that this is, the third is the /// size in bytes of this constant pool entry. let hasSideEffects = 0, isNotDuplicable = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; /// A jumptable consisting of direct 32-bit addresses of the destination basic /// blocks (either absolute, or relative to the start of the jump-table in PIC /// mode). Used mostly in ARM and Thumb-1 modes. def JUMPTABLE_ADDRS : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; /// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables /// that cannot be optimised to use TBB or TBH. def JUMPTABLE_INSTS : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; /// A jumptable consisting of 8-bit unsigned integers representing offsets from /// a TBB instruction. def JUMPTABLE_TBB : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; /// A jumptable consisting of 16-bit unsigned integers representing offsets from /// a TBH instruction. def JUMPTABLE_TBH : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE // from removing one half of the matched pairs. That breaks PEI, which assumes // these will always be in pairs, and asserts if it finds otherwise. Better way? 
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKUP : PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary, [(ARMcallseq_start timm:$amt, timm:$amt2)]>; } def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>, Requires<[IsARM, HasV6]> { bits<8> imm; let Inst{27-8} = 0b00110010000011110000; let Inst{7-0} = imm; let DecoderMethod = "DecodeHINTInstruction"; } def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>; def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>; def : InstAlias<"csdb$p", (HINT 20, pred:$p)>, Requires<[IsARM, HasV6K]>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (int_arm_sel GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV6]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{3-0} = Rm; let Inst{15-12} = Rd; let Inst{19-16} = Rn; let Inst{27-20} = 0b01101000; let Inst{7-4} = 0b1011; let Inst{11-8} = 0b1111; let Unpredictable{11-8} = 0b1111; } // The 16-bit operand $val can be used by a debugger to store more information // about the breakpoint. def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, "bkpt", "\t$val", []>, Requires<[IsARM]> { bits<16> val; let Inst{3-0} = val{3-0}; let Inst{19-8} = val{15-4}; let Inst{27-20} = 0b00010010; let Inst{31-28} = 0xe; // AL let Inst{7-4} = 0b0111; } // default immediate for breakpoint mnemonic def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>; def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> { bits<16> val; let Inst{3-0} = val{3-0}; let Inst{19-8} = val{15-4}; let Inst{27-20} = 0b00010000; let Inst{31-28} = 0xe; // AL let Inst{7-4} = 0b0111; } // Change Processor State // FIXME: We should use InstAlias to handle the optional operands. class CPS : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops), []>, Requires<[IsARM]> { bits<2> imod; bits<3> iflags; bits<5> mode; bit M; let Inst{31-28} = 0b1111; let Inst{27-20} = 0b00010000; let Inst{19-18} = imod; let Inst{17} = M; // Enabled if mode is set; let Inst{16-9} = 0b00000000; let Inst{8-6} = iflags; let Inst{5} = 0; let Inst{4-0} = mode; } let DecoderMethod = "DecodeCPSInstruction" in { let M = 1 in def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode), "$imod\t$iflags, $mode">; let mode = 0, M = 0 in def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">; let imod = 0, iflags = 0, M = 1 in def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">; } // Preload signals the memory system of possible future data/instruction access. 
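// From C and C++ these preloads are usually reached through
// __builtin_prefetch; a minimal usage sketch (whether pld or pldw is chosen
// for the write hint depends on the MP extension, and pli is the
// instruction-side variant not covered by this builtin):
/*
void warm(const int *P) {
  __builtin_prefetch(P, 0); // rw=0: data read hint  -> pld-style preload
  __builtin_prefetch(P, 1); // rw=1: data write hint -> pldw-style preload
}
*/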
multiclass APreLoad read, bits<1> data, string opc> { def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm, IIC_Preload, !strconcat(opc, "\t$addr"), [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>, Sched<[WritePreLd]> { bits<4> Rt; bits<17> addr; let Inst{31-26} = 0b111101; let Inst{25} = 0; // 0 for immediate form let Inst{24} = data; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{22} = read; let Inst{21-20} = 0b01; let Inst{19-16} = addr{16-13}; // Rn let Inst{15-12} = 0b1111; let Inst{11-0} = addr{11-0}; // imm12 } def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload, !strconcat(opc, "\t$shift"), [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>, Sched<[WritePreLd]> { bits<17> shift; let Inst{31-26} = 0b111101; let Inst{25} = 1; // 1 for register form let Inst{24} = data; let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{22} = read; let Inst{21-20} = 0b01; let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = 0b1111; let Inst{11-0} = shift{11-0}; let Inst{4} = 0; } } defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>; defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary, "setend\t$end", []>, Requires<[IsARM]>, Deprecated { bits<1> end; let Inst{31-10} = 0b1111000100000001000000; let Inst{9} = end; let Inst{8-0} = 0; } def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", [(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> { bits<4> opt; let Inst{27-4} = 0b001100100000111100001111; let Inst{3-0} = opt; } // A8.8.247 UDF - Undefined (Encoding A1) def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> { bits<16> imm16; let Inst{31-28} = 0b1110; // AL let Inst{27-25} = 0b011; let Inst{24-20} = 0b11111; let Inst{19-8} = imm16{15-4}; let Inst{7-4} = 0b1111; let Inst{3-0} = imm16{3-0}; } /* * A5.4 Permanently UNDEFINED instructions. * * For most targets use UDF #65006, for which the OS will generate SIGTRAP. * Other UDF encodings generate SIGILL. * * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb. * Encoding A1: * 1110 0111 1111 iiii iiii iiii 1111 iiii * Encoding T1: * 1101 1110 iiii iiii * It uses the following encoding: * 1110 0111 1111 1110 1101 1110 1111 0000 * - In ARM: UDF #60896; * - In Thumb: UDF #254 followed by a branch-to-self. */ let isBarrier = 1, isTerminator = 1 in def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, Requires<[IsARM,UseNaClTrap]> { let Inst = 0xe7fedef0; } let isBarrier = 1, isTerminator = 1 in def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, Requires<[IsARM,DontUseNaClTrap]> { let Inst = 0xe7ffdefe; } def : Pat<(debugtrap), (BKPT 0)>, Requires<[IsARM, HasV5T]>; def : Pat<(debugtrap), (UDF 254)>, Requires<[IsARM, NoV5T]>; // Address computation and loads and stores in PIC mode. 
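// The constant pool entry consumed by PICADD holds the distance from the
// label to the global (adjusted for the pc read-ahead), so adding the pc
// value at the label rebuilds the absolute address wherever the image is
// loaded. A C++ model of the arithmetic (illustrative only, hypothetical
// helper name):
/*
#include <cstdint>

uintptr_t picAddress(uintptr_t PCAtLabel, intptr_t GVMinusLabel) {
  return PCAtLabel + GVMinusLabel; // == &GV at run time
}
*/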
let isNotDuplicable = 1 in { def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), 4, IIC_iALUr, [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>, Sched<[WriteALU, ReadALU]>; let AddedComplexity = 10 in { def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), 4, IIC_iLoad_r, [(set GPR:$dst, (load addrmodepc:$addr))]>; def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), 4, IIC_iLoad_bh_r, [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>; def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), 4, IIC_iLoad_bh_r, [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>; def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), 4, IIC_iLoad_bh_r, [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>; def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), 4, IIC_iLoad_bh_r, [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>; } let AddedComplexity = 10 in { def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), 4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>; def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), 4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src, addrmodepc:$addr)]>; def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), 4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; } } // isNotDuplicable = 1 // LEApcrel - Load a pc-relative address into a register without offending the // assembler. let hasSideEffects = 0, isReMaterializable = 1 in // The 'adr' mnemonic encodes differently if the label is before or after // the instruction. The {24-21} opcode bits are set by the fixup, as we don't // know until then which form of the instruction will be used. def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>, Sched<[WriteALU, ReadALU]> { bits<4> Rd; bits<14> label; let Inst{27-25} = 0b001; let Inst{24} = 0; let Inst{23-22} = label{13-12}; let Inst{21} = 0; let Inst{20} = 0; let Inst{19-16} = 0b1111; let Inst{15-12} = Rd; let Inst{11-0} = label{11-0}; } let hasSideEffects = 1 in { def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; } //===----------------------------------------------------------------------===// // Control Flow Instructions. // let isReturn = 1, isTerminator = 1, isBarrier = 1 in { // ARMV4T and above def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, "bx", "\tlr", [(ARMretflag)]>, Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { let Inst{27-0} = 0b0001001011111111111100011110; } // ARMV4 only def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, "mov", "\tpc, lr", [(ARMretflag)]>, Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> { let Inst{27-0} = 0b0001101000001111000000001110; } // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets // the user-space one). 
def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p), 4, IIC_Br, [(ARMintretflag imm:$offset)]>; } // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // ARMV4T and above def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", [(brind GPR:$dst)]>, Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { bits<4> dst; let Inst{31-4} = 0b1110000100101111111111110001; let Inst{3-0} = dst; } def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx", "\t$dst", [/* pattern left blank */]>, Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { bits<4> dst; let Inst{27-4} = 0b000100101111111111110001; let Inst{3-0} = dst; } } // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. let isCall = 1, // FIXME: Do we really need a non-predicated version? If so, it should // at least be a pseudo instruction expanding to the predicated version // at MC lowering time. Defs = [LR], Uses = [SP] in { def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBrL]> { let Inst{31-28} = 0b1110; bits<24> func; let Inst{23-0} = func; let DecoderMethod = "DecodeBranchImmInstruction"; } def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func), IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBrL]> { bits<24> func; let Inst{23-0} = func; let DecoderMethod = "DecodeBranchImmInstruction"; } // ARMv5T and above def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; let Inst{3-0} = func; } def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx", "\t$func", [(ARMcall_pred GPR:$func)]>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{27-4} = 0b000100101111111111110011; let Inst{3-0} = func; } // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>; // ARMv4 def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>; // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBr]>; - - // push lr before the call - def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func), - 4, IIC_Br, - []>, - Requires<[IsARM]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1 in { // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target), IIC_Br, "b", "\t$target", [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>, Sched<[WriteBr]> { bits<24> target; let Inst{23-0} = target; let DecoderMethod = "DecodeBranchImmInstruction"; } let isBarrier = 1 in { // B is "predicable" since it's just a Bcc with an 'always' condition. let isPredicable = 1 in // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly // should be sufficient. // FIXME: Is B really a Barrier? That doesn't seem right. 
def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br, [(br bb:$target)], (Bcc arm_br_target:$target, (ops 14, zero_reg))>, Sched<[WriteBr]>; let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : ARMPseudoInst<(outs), (ins GPR:$target, i32imm:$jt), 0, IIC_Br, [(ARMbrjt GPR:$target, tjumptable:$jt)]>, Sched<[WriteBr]>; def BR_JTm_i12 : ARMPseudoInst<(outs), (ins addrmode_imm12:$target, i32imm:$jt), 0, IIC_Br, [(ARMbrjt (i32 (load addrmode_imm12:$target)), tjumptable:$jt)]>, Sched<[WriteBrTbl]>; def BR_JTm_rs : ARMPseudoInst<(outs), (ins ldst_so_reg:$target, i32imm:$jt), 0, IIC_Br, [(ARMbrjt (i32 (load ldst_so_reg:$target)), tjumptable:$jt)]>, Sched<[WriteBrTbl]>; def BR_JTadd : ARMPseudoInst<(outs), (ins GPR:$target, GPR:$idx, i32imm:$jt), 0, IIC_Br, [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt)]>, Sched<[WriteBrTbl]>; } // isNotDuplicable = 1, isIndirectBranch = 1 } // isBarrier = 1 } // BLX (immediate) def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary, "blx\t$target", []>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { let Inst{31-25} = 0b1111101; bits<25> target; let Inst{23-0} = target{24-1}; let Inst{24} = target{0}; let isCall = 1; } // Branch and Exchange Jazelle def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", [/* pattern left blank */]>, Sched<[WriteBr]> { bits<4> func; let Inst{23-20} = 0b0010; let Inst{19-8} = 0xfff; let Inst{7-4} = 0b0010; let Inst{3-0} = func; let isBranch = 1; } // Tail calls. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>, Sched<[WriteBr]>; def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>, Sched<[WriteBr]>; def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst), 4, IIC_Br, [], (Bcc arm_br_target:$dst, (ops 14, zero_reg))>, Requires<[IsARM]>, Sched<[WriteBr]>; def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], (BX GPR:$dst)>, Sched<[WriteBr]>, Requires<[IsARM, HasV4T]>; } // Secure Monitor Call is a system instruction. 
def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []>, Requires<[IsARM, HasTrustZone]> { bits<4> opt; let Inst{23-4} = 0b01100000000000000111; let Inst{3-0} = opt; } def : MnemonicAlias<"smi", "smc">; // Supervisor Call (Software Interrupt) let isCall = 1, Uses = [SP] in { def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>, Sched<[WriteBr]> { bits<24> svc; let Inst{23-0} = svc; } } // Store Return State class SRSI : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm, NoItinerary, asm, "", []> { bits<5> mode; let Inst{31-28} = 0b1111; let Inst{27-25} = 0b100; let Inst{22} = 1; let Inst{21} = wb; let Inst{20} = 0; let Inst{19-16} = 0b1101; // SP let Inst{15-5} = 0b00000101000; let Inst{4-0} = mode; } def SRSDA : SRSI<0, "srsda\tsp, $mode"> { let Inst{24-23} = 0; } def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> { let Inst{24-23} = 0; } def SRSDB : SRSI<0, "srsdb\tsp, $mode"> { let Inst{24-23} = 0b10; } def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> { let Inst{24-23} = 0b10; } def SRSIA : SRSI<0, "srsia\tsp, $mode"> { let Inst{24-23} = 0b01; } def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> { let Inst{24-23} = 0b01; } def SRSIB : SRSI<0, "srsib\tsp, $mode"> { let Inst{24-23} = 0b11; } def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> { let Inst{24-23} = 0b11; } def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>; def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>; def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>; def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>; def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>; def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>; def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>; def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>; // Return From Exception class RFEI : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm, NoItinerary, asm, "", []> { bits<4> Rn; let Inst{31-28} = 0b1111; let Inst{27-25} = 0b100; let Inst{22} = 0; let Inst{21} = wb; let Inst{20} = 1; let Inst{19-16} = Rn; let Inst{15-0} = 0xa00; } def RFEDA : RFEI<0, "rfeda\t$Rn"> { let Inst{24-23} = 0; } def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> { let Inst{24-23} = 0; } def RFEDB : RFEI<0, "rfedb\t$Rn"> { let Inst{24-23} = 0b10; } def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> { let Inst{24-23} = 0b10; } def RFEIA : RFEI<0, "rfeia\t$Rn"> { let Inst{24-23} = 0b01; } def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> { let Inst{24-23} = 0b01; } def RFEIB : RFEI<0, "rfeib\t$Rn"> { let Inst{24-23} = 0b11; } def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> { let Inst{24-23} = 0b11; } // Hypervisor Call is a system instruction let isCall = 1 in { def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary, "hvc", "\t$imm", []>, Requires<[IsARM, HasVirtualization]> { bits<16> imm; // Even though HVC isn't predicable, it's encoding includes a condition field. // The instruction is undefined if the condition field is 0xf otherwise it is // unpredictable if it isn't condition AL (0xe). let Inst{31-28} = 0b1110; let Unpredictable{31-28} = 0b1111; let Inst{27-24} = 0b0001; let Inst{23-20} = 0b0100; let Inst{19-8} = imm{15-4}; let Inst{7-4} = 0b0111; let Inst{3-0} = imm{3-0}; } } // Return from exception in Hypervisor mode. 
// Hypervisor Call is a system instruction.
let isCall = 1 in {
def HVC : AInoP<(outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
                "hvc", "\t$imm", []>,
          Requires<[IsARM, HasVirtualization]> {
  bits<16> imm;

  // Even though HVC isn't predicable, its encoding includes a condition
  // field. The instruction is undefined if the condition field is 0xf,
  // otherwise it is unpredictable if it isn't condition AL (0xe).
  let Inst{31-28} = 0b1110;
  let Unpredictable{31-28} = 0b1111;
  let Inst{27-24} = 0b0001;
  let Inst{23-20} = 0b0100;
  let Inst{19-8} = imm{15-4};
  let Inst{7-4} = 0b0111;
  let Inst{3-0} = imm{3-0};
}
}

// Return from exception in Hypervisor mode.
let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
           Requires<[IsARM, HasVirtualization]> {
  let Inst{23-0} = 0b011000000000000001101110;
}

//===----------------------------------------------------------------------===//
//  Load / Store Instructions.
//

// Load

defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
                        zextloadi8>;
defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
                        truncstorei8>;

// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
    isReMaterializable = 1, isCodeGenOnly = 1 in
def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
                    AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
                    []> {
  bits<4> Rt;
  bits<17> addr;
  let Inst{23} = addr{12};     // U (add = ('U' == 1))
  let Inst{19-16} = 0b1111;
  let Inst{15-12} = Rt;
  let Inst{11-0} = addr{11-0}; // imm12
}

// Loads with zero extension
def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
                 IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
                 [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;

// Loads with sign extension
def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
                  IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
                  [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;

def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
                  IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
                  [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;

let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
                 LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
           Requires<[IsARM, HasV5TE]>;
}

def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                   NoItinerary, "lda", "\t$Rt, $addr", []>;
def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                    NoItinerary, "ldab", "\t$Rt, $addr", []>;
def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                    NoItinerary, "ldah", "\t$Rt, $addr", []>;
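// Sketch of what the load/store defms above provide (the record names assume
// the usual i12/rs suffixes from AI_ldr1 and friends, defined earlier in this
// file):
//   ldr r0, [r1, #8]           @ LDRi12, addrmode_imm12 form
//   ldr r0, [r1, r2, lsl #2]   @ LDRrs,  register-shifted offset form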
// Indexed loads
multiclass AI2_ldridx<bit isByte, string opc,
                      InstrItinClass iii, InstrItinClass iir> {
  def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                            (ins addrmode_imm12_pre:$addr), IndexModePre,
                            LdFrm, iii, opc, "\t$Rt, $addr!",
                            "$addr.base = $Rn_wb", []> {
    bits<17> addr;
    let Inst{25} = 0;
    let Inst{23} = addr{12};
    let Inst{19-16} = addr{16-13};
    let Inst{11-0} = addr{11-0};
    let DecoderMethod = "DecodeLDRPreImm";
  }

  def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                            (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
                            opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
    bits<17> addr;
    let Inst{25} = 1;
    let Inst{23} = addr{12};
    let Inst{19-16} = addr{16-13};
    let Inst{11-0} = addr{11-0};
    let Inst{4} = 0;
    let DecoderMethod = "DecodeLDRPreReg";
  }

  def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins addr_offset_none:$addr,
                                  am2offset_reg:$offset),
                             IndexModePost, LdFrm, iir,
                             opc, "\t$Rt, $addr, $offset",
                             "$addr.base = $Rn_wb", []> {
    // {12}  isAdd
    // {11-0} imm12/Rm
    bits<14> offset;
    bits<4> addr;
    let Inst{25} = 1;
    let Inst{23} = offset{12};
    let Inst{19-16} = addr;
    let Inst{11-0} = offset{11-0};
    let Inst{4} = 0;
    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
  }

  def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins addr_offset_none:$addr,
                                  am2offset_imm:$offset),
                             IndexModePost, LdFrm, iii,
                             opc, "\t$Rt, $addr, $offset",
                             "$addr.base = $Rn_wb", []> {
    // {12}  isAdd
    // {11-0} imm12/Rm
    bits<14> offset;
    bits<4> addr;
    let Inst{25} = 0;
    let Inst{23} = offset{12};
    let Inst{19-16} = addr;
    let Inst{11-0} = offset{11-0};
    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
  }
}

let mayLoad = 1, hasSideEffects = 0 in {
// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
// IIC_iLoad_siu depending on whether the offset register is shifted.
defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
}

multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
  def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                        (ins addrmode3_pre:$addr), IndexModePre,
                        LdMiscFrm, itin,
                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
    bits<14> addr;
    let Inst{23}    = addr{8};     // U bit
    let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
    let Inst{19-16} = addr{12-9};  // Rn
    let Inst{11-8}  = addr{7-4};   // imm7_4/zero
    let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
    let DecoderMethod = "DecodeAddrMode3Instruction";
  }
  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                         (ins addr_offset_none:$addr, am3offset:$offset),
                         IndexModePost, LdMiscFrm, itin,
                         opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
                         []> {
    bits<10> offset;
    bits<4> addr;
    let Inst{23}    = offset{8};   // U bit
    let Inst{22}    = offset{9};   // 1 == imm8, 0 == Rm
    let Inst{19-16} = addr;
    let Inst{11-8}  = offset{7-4}; // imm7_4/zero
    let Inst{3-0}   = offset{3-0}; // imm3_0/Rm
    let DecoderMethod = "DecodeAddrMode3Instruction";
  }
}

let mayLoad = 1, hasSideEffects = 0 in {
defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;

let hasExtraDefRegAllocReq = 1 in {
def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                          (ins addrmode3_pre:$addr), IndexModePre,
                          LdMiscFrm, IIC_iLoad_d_ru,
                          "ldrd", "\t$Rt, $Rt2, $addr!",
                          "$addr.base = $Rn_wb", []> {
  bits<14> addr;
  let Inst{23}    = addr{8};     // U bit
  let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
  let Inst{19-16} = addr{12-9};  // Rn
  let Inst{11-8}  = addr{7-4};   // imm7_4/zero
  let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
  let DecoderMethod = "DecodeAddrMode3Instruction";
}
def LDRD_POST: AI3ldstidx<0b1101, 0, 0,
                          (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                          (ins addr_offset_none:$addr, am3offset:$offset),
                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
                          "$addr.base = $Rn_wb", []> {
  bits<10> offset;
  bits<4> addr;
  let Inst{23}    = offset{8};   // U bit
  let Inst{22}    = offset{9};   // 1 == imm8, 0 == Rm
  let Inst{19-16} = addr;
  let Inst{11-8}  = offset{7-4}; // imm7_4/zero
  let Inst{3-0}   = offset{3-0}; // imm3_0/Rm
  let DecoderMethod = "DecodeAddrMode3Instruction";
}
} // hasExtraDefRegAllocReq = 1
} // mayLoad = 1, hasSideEffects = 0
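// Illustrative expansion of the indexed-load multiclasses: the defm LDR above
// yields LDR_PRE_IMM, LDR_PRE_REG, LDR_POST_REG and LDR_POST_IMM, i.e.:
//   ldr r0, [r1, #4]!   @ pre-indexed, r1 updated before the access
//   ldr r0, [r1], #4    @ post-indexed, r1 updated after the access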
// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
let mayLoad = 1, hasSideEffects = 0 in {
def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                    (ins addr_offset_none:$addr, am2offset_reg:$offset),
                    IndexModePost, LdFrm, IIC_iLoad_ru,
                    "ldrt", "\t$Rt, $addr, $offset",
                    "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 1;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-5} = offset{11-5};
  let Inst{4} = 0;
  let Inst{3-0} = offset{3-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def LDRT_POST_IMM
  : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
               (ins addr_offset_none:$addr, am2offset_imm:$offset),
               IndexModePost, LdFrm, IIC_iLoad_ru,
               "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 0;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-0} = offset{11-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
                     IndexModePost, LdFrm, IIC_iLoad_bh_ru,
                     "ldrbt", "\t$Rt, $addr, $offset",
                     "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 1;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-5} = offset{11-5};
  let Inst{4} = 0;
  let Inst{3-0} = offset{3-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def LDRBT_POST_IMM
  : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
               (ins addr_offset_none:$addr, am2offset_imm:$offset),
               IndexModePost, LdFrm, IIC_iLoad_bh_ru,
               "ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 0;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-0} = offset{11-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

multiclass AI3ldrT<bits<4> op, string opc> {
  def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
                      (ins addr_offset_none:$addr, postidx_imm8:$offset),
                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
    bits<9> offset;
    let Inst{23} = offset{8};
    let Inst{22} = 1;
    let Inst{11-8} = offset{7-4};
    let Inst{3-0} = offset{3-0};
  }
  def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
                      (ins addr_offset_none:$addr, postidx_reg:$Rm),
                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
    bits<5> Rm;
    let Inst{23} = Rm{4};
    let Inst{22} = 0;
    let Inst{11-8} = 0;
    let Unpredictable{11-8} = 0b1111;
    let Inst{3-0} = Rm{3-0};
    let DecoderMethod = "DecodeLDR";
  }
}

defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
}

def LDRT_POST : ARMAsmPseudo<"ldrt${q} $Rt, $addr",
                             (ins addr_offset_none:$addr, pred:$q),
                             (outs GPR:$Rt)>;

def LDRBT_POST : ARMAsmPseudo<"ldrbt${q} $Rt, $addr",
                              (ins addr_offset_none:$addr, pred:$q),
                              (outs GPR:$Rt)>;

// Pseudo instruction ldr Rt, =immediate
def LDRConstPool : ARMAsmPseudo<"ldr${q} $Rt, $immediate",
                                (ins const_pool_asm_imm:$immediate, pred:$q),
                                (outs GPR:$Rt)>;
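// The ldr-pseudo accepts a full 32-bit expression; the assembler either
// rewrites it to a mov/mvn when the constant is encodable, or spills it to a
// literal pool and emits a pc-relative load, e.g.:
//   ldr r0, =0x87654321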
$addr!", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { bits<17> addr; let Inst{25} = 0; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{11-0} = addr{11-0}; // imm12 let DecoderMethod = "DecodeSTRPreImm"; } def _PRE_REG : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, ldst_so_reg:$addr), IndexModePre, StFrm, iir, opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { bits<17> addr; let Inst{25} = 1; let Inst{23} = addr{12}; // U (add = ('U' == 1)) let Inst{19-16} = addr{16-13}; // Rn let Inst{11-0} = addr{11-0}; let Inst{4} = 0; // Inst{4} = 0 let DecoderMethod = "DecodeSTRPreReg"; } def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), IndexModePost, StFrm, iir, opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { // {12} isAdd // {11-0} imm12/Rm bits<14> offset; bits<4> addr; let Inst{25} = 1; let Inst{23} = offset{12}; let Inst{19-16} = addr; let Inst{11-0} = offset{11-0}; let Inst{4} = 0; let DecoderMethod = "DecodeAddrMode2IdxInstruction"; } def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), IndexModePost, StFrm, iii, opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { // {12} isAdd // {11-0} imm12/Rm bits<14> offset; bits<4> addr; let Inst{25} = 0; let Inst{23} = offset{12}; let Inst{19-16} = addr; let Inst{11-0} = offset{11-0}; let DecoderMethod = "DecodeAddrMode2IdxInstruction"; } } let mayStore = 1, hasSideEffects = 0 in { // FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or // IIC_iStore_siu depending on whether it the offset register is shifted. defm STR : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>; defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>; } def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), (STR_POST_REG GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset)>; def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset)>; def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset)>; def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset)>; // Pseudo-instructions for pattern matching the pre-indexed stores. We can't // put the patterns on the instruction definitions directly as ISel wants // the address base and offset to be separate operands, not a single // complex operand like we represent the instructions themselves. The // pseudos map between the two. 
let usesCustomInserter = 1, Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in { def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), 4, IIC_iStore_ru, [(set GPR:$Rn_wb, (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), 4, IIC_iStore_ru, [(set GPR:$Rn_wb, (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), 4, IIC_iStore_ru, [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), 4, IIC_iStore_ru, [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p), 4, IIC_iStore_ru, [(set GPR:$Rn_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; } def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre, StMiscFrm, IIC_iStore_bh_ru, "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { bits<14> addr; let Inst{23} = addr{8}; // U bit let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm let Inst{19-16} = addr{12-9}; // Rn let Inst{11-8} = addr{7-4}; // imm7_4/zero let Inst{3-0} = addr{3-0}; // imm3_0/Rm let DecoderMethod = "DecodeAddrMode3Instruction"; } def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset), IndexModePost, StMiscFrm, IIC_iStore_bh_ru, "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, addr_offset_none:$addr, am3offset:$offset))]> { bits<10> offset; bits<4> addr; let Inst{23} = offset{8}; // U bit let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm let Inst{19-16} = addr; let Inst{11-8} = offset{7-4}; // imm7_4/zero let Inst{3-0} = offset{3-0}; // imm3_0/Rm let DecoderMethod = "DecodeAddrMode3Instruction"; } let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr), IndexModePre, StMiscFrm, IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", "$addr.base = $Rn_wb", []> { bits<14> addr; let Inst{23} = addr{8}; // U bit let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm let Inst{19-16} = addr{12-9}; // Rn let Inst{11-8} = addr{7-4}; // imm7_4/zero let Inst{3-0} = addr{3-0}; // imm3_0/Rm let DecoderMethod = "DecodeAddrMode3Instruction"; } def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr, am3offset:$offset), IndexModePost, StMiscFrm, IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr, $offset", "$addr.base = $Rn_wb", []> { bits<10> offset; bits<4> addr; let Inst{23} = offset{8}; // U bit let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm let Inst{19-16} = addr; let Inst{11-8} = offset{7-4}; // imm7_4/zero let Inst{3-0} = offset{3-0}; // imm3_0/Rm let DecoderMethod = "DecodeAddrMode3Instruction"; } } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 // STRT, STRBT, and STRHT def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), IndexModePost, StFrm, IIC_iStore_bh_ru, "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> { // {12} 
  // isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 1;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-5} = offset{11-5};
  let Inst{4} = 0;
  let Inst{3-0} = offset{3-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def STRBT_POST_IMM
  : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
               IndexModePost, StFrm, IIC_iStore_bh_ru,
               "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 0;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-0} = offset{11-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def STRBT_POST : ARMAsmPseudo<"strbt${q} $Rt, $addr",
                              (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;

let mayStore = 1, hasSideEffects = 0 in {
def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
                    (ins GPR:$Rt, addr_offset_none:$addr,
                         am2offset_reg:$offset),
                    IndexModePost, StFrm, IIC_iStore_ru,
                    "strt", "\t$Rt, $addr, $offset",
                    "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 1;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-5} = offset{11-5};
  let Inst{4} = 0;
  let Inst{3-0} = offset{3-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}

def STRT_POST_IMM
  : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
               IndexModePost, StFrm, IIC_iStore_ru,
               "strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
  // {12}  isAdd
  // {11-0} imm12/Rm
  bits<14> offset;
  bits<4> addr;
  let Inst{25} = 0;
  let Inst{23} = offset{12};
  let Inst{21} = 1; // overwrite
  let Inst{19-16} = addr;
  let Inst{11-0} = offset{11-0};
  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
}

def STRT_POST : ARMAsmPseudo<"strt${q} $Rt, $addr",
                             (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;

multiclass AI3strT<bits<4> op, string opc> {
  def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
                      (ins GPR:$Rt, addr_offset_none:$addr,
                           postidx_imm8:$offset),
                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
    bits<9> offset;
    let Inst{23} = offset{8};
    let Inst{22} = 1;
    let Inst{11-8} = offset{7-4};
    let Inst{3-0} = offset{3-0};
  }
  def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
                      (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
    bits<5> Rm;
    let Inst{23} = Rm{4};
    let Inst{22} = 0;
    let Inst{11-8} = 0;
    let Inst{3-0} = Rm{3-0};
  }
}

defm STRHT : AI3strT<0b1011, "strht">;

def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
                   NoItinerary, "stl", "\t$Rt, $addr", []>;
def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
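// STL/STLB/STLH are the store-release counterparts of the LDA* load-acquire
// instructions defined earlier; both families take a bare [Rn] address
// (addr_offset_none) with no offset forms, e.g.:
//   stl r0, [r1]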
//===----------------------------------------------------------------------===//
//  Load / store multiple Instructions.
//

multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit,
                         Format f, InstrItinClass itin,
                         InstrItinClass itin_upd> {
  // IA is the default, so no need for an explicit suffix on the
  // mnemonic here. The suffix-less form is the canonical spelling.
  def IA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                IndexModeNone, f, itin,
                !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
    let Inst{24-23} = 0b01; // Increment After
    let Inst{22} = P_bit;
    let Inst{21} = 0;       // No writeback
    let Inst{20} = L_bit;
  }
  def IA_UPD : AXI4<(outs GPR:$wb),
                    (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                    IndexModeUpd, f, itin_upd,
                    !strconcat(asm, "${p}\t$Rn!, $regs", sfx),
                    "$Rn = $wb", []> {
    let Inst{24-23} = 0b01; // Increment After
    let Inst{22} = P_bit;
    let Inst{21} = 1;       // Writeback
    let Inst{20} = L_bit;
    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
  }
  def DA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                IndexModeNone, f, itin,
                !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
    let Inst{24-23} = 0b00; // Decrement After
    let Inst{22} = P_bit;
    let Inst{21} = 0;       // No writeback
    let Inst{20} = L_bit;
  }
  def DA_UPD : AXI4<(outs GPR:$wb),
                    (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                    IndexModeUpd, f, itin_upd,
                    !strconcat(asm, "da${p}\t$Rn!, $regs", sfx),
                    "$Rn = $wb", []> {
    let Inst{24-23} = 0b00; // Decrement After
    let Inst{22} = P_bit;
    let Inst{21} = 1;       // Writeback
    let Inst{20} = L_bit;
    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
  }
  def DB : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                IndexModeNone, f, itin,
                !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
    let Inst{24-23} = 0b10; // Decrement Before
    let Inst{22} = P_bit;
    let Inst{21} = 0;       // No writeback
    let Inst{20} = L_bit;
  }
  def DB_UPD : AXI4<(outs GPR:$wb),
                    (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                    IndexModeUpd, f, itin_upd,
                    !strconcat(asm, "db${p}\t$Rn!, $regs", sfx),
                    "$Rn = $wb", []> {
    let Inst{24-23} = 0b10; // Decrement Before
    let Inst{22} = P_bit;
    let Inst{21} = 1;       // Writeback
    let Inst{20} = L_bit;
    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
  }
  def IB : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                IndexModeNone, f, itin,
                !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
    let Inst{24-23} = 0b11; // Increment Before
    let Inst{22} = P_bit;
    let Inst{21} = 0;       // No writeback
    let Inst{20} = L_bit;
  }
  def IB_UPD : AXI4<(outs GPR:$wb),
                    (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
                    IndexModeUpd, f, itin_upd,
                    !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx),
                    "$Rn = $wb", []> {
    let Inst{24-23} = 0b11; // Increment Before
    let Inst{22} = P_bit;
    let Inst{21} = 1;       // Writeback
    let Inst{20} = L_bit;
    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
  }
}

let hasSideEffects = 0 in {

let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
                         IIC_iLoad_mu>,
           ComplexDeprecationPredicate<"ARMLoad">;

let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
                         IIC_iStore_mu>,
           ComplexDeprecationPredicate<"ARMStore">;

} // hasSideEffects

// FIXME: remove when we have a way of marking a MI with these properties.
// FIXME: Should pc be an implicit operand like PICADD, etc?
let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), 4, IIC_iLoad_mBr, [], (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>, RegConstraint<"$Rn = $wb">; let mayLoad = 1, hasExtraDefRegAllocReq = 1 in defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>; let mayStore = 1, hasExtraSrcRegAllocReq = 1 in defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m, IIC_iStore_mu>; //===----------------------------------------------------------------------===// // Move Instructions. // let hasSideEffects = 0, isMoveReg = 1 in def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; let Inst{19-16} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; let Inst{15-12} = Rd; } // A version for the smaller set of tail call registers. let hasSideEffects = 0 in def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; let Inst{15-12} = Rd; } def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src), DPSoRegRegFrm, IIC_iMOVsr, "mov", "\t$Rd, $src", [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; let Inst{19-16} = 0b0000; let Inst{11-8} = src{11-8}; let Inst{7} = 0; let Inst{6-5} = src{6-5}; let Inst{4} = 1; let Inst{3-0} = src{3-0}; let Inst{25} = 0; } def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src), DPSoRegImmFrm, IIC_iMOVsr, "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; let Inst{19-16} = 0b0000; let Inst{11-5} = src{11-5}; let Inst{4} = 0; let Inst{3-0} = src{3-0}; let Inst{25} = 0; } let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi, "mov", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm:$imm)]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> imm; let Inst{25} = 1; let Inst{15-12} = Rd; let Inst{19-16} = 0b0000; let Inst{11-0} = imm; } let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm), DPFrm, IIC_iMOVi, "movw", "\t$Rd, $imm", [(set GPR:$Rd, imm0_65535:$imm)]>, Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<16> imm; let Inst{15-12} = Rd; let Inst{11-0} = imm{11-0}; let Inst{19-16} = imm{15-12}; let Inst{20} = 0; let Inst{25} = 1; let DecoderMethod = "DecodeArmMOVTWInstruction"; } def : InstAlias<"mov${p} $Rd, $imm", (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>, Requires<[IsARM, HasV6T2]>; def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>; let Constraints = "$src = $Rd" in { def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), (ins GPR:$src, imm0_65535_expr:$imm), DPFrm, IIC_iMOVi, "movt", "\t$Rd, $imm", [(set GPRnopc:$Rd, (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>, UnaryDP, Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> { bits<4> Rd; bits<16> imm; let Inst{15-12} = Rd; let Inst{11-0} = imm{11-0}; let Inst{19-16} = imm{15-12}; let Inst{20} = 0; let 
Inst{25} = 1; let DecoderMethod = "DecodeArmMOVTWInstruction"; } def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>; } // Constraints def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, Requires<[IsARM, HasV6T2]>; let Uses = [CPSR] in def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP, Requires<[IsARM]>, Sched<[WriteALU]>; // These aren't really mov instructions, but we have to define them this way // due to flag operands. let Defs = [CPSR] in { def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>; def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>; } //===----------------------------------------------------------------------===// // Extend Instructions. // // Sign extenders def SXTB : AI_ext_rrot<0b01101010, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; def SXTH : AI_ext_rrot<0b01101011, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; def SXTAB : AI_exta_rrot<0b01101010, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; def SXTAH : AI_exta_rrot<0b01101011, "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)), (SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot), i16)), (SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">; def : ARMV6Pat<(int_arm_sxtb16 GPR:$Src), (SXTB16 GPR:$Src, 0)>; def : ARMV6Pat<(int_arm_sxtb16 (rotr GPR:$Src, rot_imm:$rot)), (SXTB16 GPR:$Src, rot_imm:$rot)>; def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">; def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, GPR:$RHS), (SXTAB16 GPR:$LHS, GPR:$RHS, 0)>; def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)), (SXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>; // Zero extenders let AddedComplexity = 16 in { def UXTB : AI_ext_rrot<0b01101110, "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; def UXTH : AI_ext_rrot<0b01101111, "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; def UXTB16 : AI_ext_rrot<0b01101100, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; // FIXME: This pattern incorrectly assumes the shl operator is a rotate. // The transformation should probably be done as a combiner action // instead so we can include a check for masking back in the upper // eight bits of the source into the lower eight bits of the result. 
//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), // (UXTB16r_rot GPR:$Src, 3)>; def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), (UXTB16 GPR:$Src, 1)>; def : ARMV6Pat<(int_arm_uxtb16 GPR:$Src), (UXTB16 GPR:$Src, 0)>; def : ARMV6Pat<(int_arm_uxtb16 (rotr GPR:$Src, rot_imm:$rot)), (UXTB16 GPR:$Src, rot_imm:$rot)>; def UXTAB : AI_exta_rrot<0b01101110, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; def UXTAH : AI_exta_rrot<0b01101111, "uxtah", BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)), (UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)), (UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; } // This isn't safe in general, the add is two 16-bit units, not a 32-bit add. def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">; def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, GPR:$RHS), (UXTAB16 GPR:$LHS, GPR:$RHS, 0)>; def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)), (UXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>; def SBFX : I<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>, Requires<[IsARM, HasV6T2]> { bits<4> Rd; bits<4> Rn; bits<5> lsb; bits<5> width; let Inst{27-21} = 0b0111101; let Inst{6-4} = 0b101; let Inst{20-16} = width; let Inst{15-12} = Rd; let Inst{11-7} = lsb; let Inst{3-0} = Rn; } def UBFX : I<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>, Requires<[IsARM, HasV6T2]> { bits<4> Rd; bits<4> Rn; bits<5> lsb; bits<5> width; let Inst{27-21} = 0b0111111; let Inst{6-4} = 0b101; let Inst{20-16} = width; let Inst{15-12} = Rd; let Inst{11-7} = lsb; let Inst{3-0} = Rn; } //===----------------------------------------------------------------------===// // Arithmetic Instructions. // let isAdd = 1 in defm ADD : AsI1_bin_irs<0b0100, "add", IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>; defm SUB : AsI1_bin_irs<0b0010, "sub", IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>; // ADD and SUB with 's' bit set. // // Currently, ADDS/SUBS are pseudo opcodes that exist only in the // selection DAG. They are "lowered" to real ADD/SUB opcodes by // AdjustInstrPostInstrSelection where we determine whether or not to // set the "s" bit based on CPSR liveness. // // FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen // support for an optional CPSR definition that corresponds to the DAG // node's second value. We can then eliminate the implicit def of CPSR. let isAdd = 1 in defm ADDS : AsI1_bin_s_irs; defm SUBS : AsI1_bin_s_irs; def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>; def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>; def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift), (SUBSrsi $Rn, so_reg_imm:$shift)>; def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift), (SUBSrsr $Rn, so_reg_reg:$shift)>; let isAdd = 1 in defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>; defm RSB : AsI1_rbin_irs<0b0011, "rsb", IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>; // FIXME: Eliminate them if we can write def : Pat patterns which defines // CPSR and the implicit def of CPSR is not needed. defm RSBS : AsI1_rbin_s_is; defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>; // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. 
// The assume-no-carry-in form uses the negation of the input since add/sub
// assume opposite meanings of the carry flag (i.e., carry == !borrow).
// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
// details.
def : ARMPat<(add GPR:$src, mod_imm_neg:$imm),
             (SUBri GPR:$src, mod_imm_neg:$imm)>;
def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm),
             (SUBSri GPR:$src, mod_imm_neg:$imm)>;

def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
             (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
      Requires<[IsARM, HasV6T2]>;
def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
             (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
      Requires<[IsARM, HasV6T2]>;

// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR),
             (SBCri GPR:$src, mod_imm_not:$imm)>;
def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
             (SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
      Requires<[IsARM, HasV6T2]>;

// Note: These are implemented in C++ code, because they have to generate
// ADD/SUBrs instructions, which use a complex pattern that a xform function
// cannot produce.
// (mul X, 2^n+1) -> (add (X << n), X)
// (mul X, 2^n-1) -> (rsb X, (X << n))

// ARM Arithmetic Instruction
// GPR:$dst = GPR:$a op GPR:$b
class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
          list<dag> pattern = [],
          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
          string asm = "\t$Rd, $Rn, $Rm">
  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
    Sched<[WriteALU, ReadALU, ReadALU]> {
  bits<4> Rn;
  bits<4> Rd;
  bits<4> Rm;
  let Inst{27-20} = op27_20;
  let Inst{11-4} = op11_4;
  let Inst{19-16} = Rn;
  let Inst{15-12} = Rd;
  let Inst{3-0} = Rm;

  let Unpredictable{11-8} = 0b1111;
}

// Wrappers around the AAI class
class AAIRevOpr<bits<8> op27_20, bits<8> op11_4, string opc,
                list<dag> pattern = []>
  : AAI<op27_20, op11_4, opc, pattern,
        (ins GPRnopc:$Rm, GPRnopc:$Rn),
        "\t$Rd, $Rm, $Rn">;

class AAIIntrinsic<bits<8> op27_20, bits<8> op11_4, string opc,
                   Intrinsic intrinsic>
  : AAI<op27_20, op11_4, opc,
        [(set GPRnopc:$Rd, (intrinsic GPRnopc:$Rn, GPRnopc:$Rm))]>;

// Saturating add/subtract
let hasSideEffects = 1 in {
def QADD8  : AAIIntrinsic<0b01100010, 0b11111001, "qadd8", int_arm_qadd8>;
def QADD16 : AAIIntrinsic<0b01100010, 0b11110001, "qadd16", int_arm_qadd16>;
def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>;
def QSUB8  : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>;

def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd",
                      [(set GPRnopc:$Rd, (int_arm_qadd
                                            (int_arm_qadd GPRnopc:$Rm,
                                                          GPRnopc:$Rm),
                                            GPRnopc:$Rn))]>;
def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub",
                      [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
                                            (int_arm_qadd GPRnopc:$Rn,
                                                          GPRnopc:$Rn)))]>;
def QSUB : AAIRevOpr<0b00010010, 0b00000101, "qsub",
                     [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
                                                      GPRnopc:$Rn))]>;
let DecoderMethod = "DecodeQADDInstruction" in
def QADD : AAIRevOpr<0b00010000, 0b00000101, "qadd",
                     [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm,
                                                      GPRnopc:$Rn))]>;
}
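// The Q* forms saturate each lane to its signed range instead of wrapping,
// e.g.:
//   qadd16 r0, r1, r2   @ two 16-bit lane adds, each clamped to [-2^15, 2^15-1]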
def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
def UQADD8  : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
def UQSUB8  : AAIIntrinsic<0b01100110, 0b11111111, "uqsub8", int_arm_uqsub8>;
def QASX    : AAIIntrinsic<0b01100010, 0b11110011, "qasx", int_arm_qasx>;
def QSAX    : AAIIntrinsic<0b01100010, 0b11110101, "qsax", int_arm_qsax>;
def UQASX   : AAIIntrinsic<0b01100110, 0b11110011, "uqasx", int_arm_uqasx>;
def UQSAX   : AAIIntrinsic<0b01100110, 0b11110101, "uqsax", int_arm_uqsax>;

// Signed/Unsigned add/subtract

def SASX   : AAIIntrinsic<0b01100001, 0b11110011, "sasx", int_arm_sasx>;
def SADD16 : AAIIntrinsic<0b01100001, 0b11110001, "sadd16", int_arm_sadd16>;
def SADD8  : AAIIntrinsic<0b01100001, 0b11111001, "sadd8", int_arm_sadd8>;
def SSAX   : AAIIntrinsic<0b01100001, 0b11110101, "ssax", int_arm_ssax>;
def SSUB16 : AAIIntrinsic<0b01100001, 0b11110111, "ssub16", int_arm_ssub16>;
def SSUB8  : AAIIntrinsic<0b01100001, 0b11111111, "ssub8", int_arm_ssub8>;
def UASX   : AAIIntrinsic<0b01100101, 0b11110011, "uasx", int_arm_uasx>;
def UADD16 : AAIIntrinsic<0b01100101, 0b11110001, "uadd16", int_arm_uadd16>;
def UADD8  : AAIIntrinsic<0b01100101, 0b11111001, "uadd8", int_arm_uadd8>;
def USAX   : AAIIntrinsic<0b01100101, 0b11110101, "usax", int_arm_usax>;
def USUB16 : AAIIntrinsic<0b01100101, 0b11110111, "usub16", int_arm_usub16>;
def USUB8  : AAIIntrinsic<0b01100101, 0b11111111, "usub8", int_arm_usub8>;

// Signed/Unsigned halving add/subtract

def SHASX   : AAIIntrinsic<0b01100011, 0b11110011, "shasx", int_arm_shasx>;
def SHADD16 : AAIIntrinsic<0b01100011, 0b11110001, "shadd16", int_arm_shadd16>;
def SHADD8  : AAIIntrinsic<0b01100011, 0b11111001, "shadd8", int_arm_shadd8>;
def SHSAX   : AAIIntrinsic<0b01100011, 0b11110101, "shsax", int_arm_shsax>;
def SHSUB16 : AAIIntrinsic<0b01100011, 0b11110111, "shsub16", int_arm_shsub16>;
def SHSUB8  : AAIIntrinsic<0b01100011, 0b11111111, "shsub8", int_arm_shsub8>;
def UHASX   : AAIIntrinsic<0b01100111, 0b11110011, "uhasx", int_arm_uhasx>;
def UHADD16 : AAIIntrinsic<0b01100111, 0b11110001, "uhadd16", int_arm_uhadd16>;
def UHADD8  : AAIIntrinsic<0b01100111, 0b11111001, "uhadd8", int_arm_uhadd8>;
def UHSAX   : AAIIntrinsic<0b01100111, 0b11110101, "uhsax", int_arm_uhsax>;
def UHSUB16 : AAIIntrinsic<0b01100111, 0b11110111, "uhsub16", int_arm_uhsub16>;
def UHSUB8  : AAIIntrinsic<0b01100111, 0b11111111, "uhsub8", int_arm_uhsub8>;

// Unsigned Sum of Absolute Differences [and Accumulate].
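// usad8 sums the absolute differences of the four byte lanes; usada8
// additionally accumulates into Ra, e.g.:
//   usad8  r0, r1, r2       @ r0 = sum(|r1.byte[i] - r2.byte[i]|)
//   usada8 r0, r1, r2, r3   @ r0 = r3 + the same sum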
def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), MulFrm /* for convenience */, NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (int_arm_usad8 GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; let Inst{27-20} = 0b01111000; let Inst{15-12} = 0b1111; let Inst{7-4} = 0b0001; let Inst{19-16} = Rd; let Inst{11-8} = Rm; let Inst{3-0} = Rn; } def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), MulFrm /* for convenience */, NoItinerary, "usada8", "\t$Rd, $Rn, $Rm, $Ra", [(set GPR:$Rd, (int_arm_usada8 GPR:$Rn, GPR:$Rm, GPR:$Ra))]>, Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{ bits<4> Rd; bits<4> Rn; bits<4> Rm; bits<4> Ra; let Inst{27-20} = 0b01111000; let Inst{7-4} = 0b0001; let Inst{19-16} = Rd; let Inst{15-12} = Ra; let Inst{11-8} = Rm; let Inst{3-0} = Rn; } // Signed/Unsigned saturate def SSAT : AI<(outs GPRnopc:$Rd), (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<5> sat_imm; bits<4> Rn; bits<8> sh; let Inst{27-21} = 0b0110101; let Inst{5-4} = 0b01; let Inst{20-16} = sat_imm; let Inst{15-12} = Rd; let Inst{11-7} = sh{4-0}; let Inst{6} = sh{5}; let Inst{3-0} = Rn; } def SSAT16 : AI<(outs GPRnopc:$Rd), (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; let Inst{27-20} = 0b01101010; let Inst{11-4} = 0b11110011; let Inst{15-12} = Rd; let Inst{19-16} = sat_imm; let Inst{3-0} = Rn; } def USAT : AI<(outs GPRnopc:$Rd), (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, Requires<[IsARM,HasV6]> { bits<4> Rd; bits<5> sat_imm; bits<4> Rn; bits<8> sh; let Inst{27-21} = 0b0110111; let Inst{5-4} = 0b01; let Inst{15-12} = Rd; let Inst{11-7} = sh{4-0}; let Inst{6} = sh{5}; let Inst{20-16} = sat_imm; let Inst{3-0} = Rn; } def USAT16 : AI<(outs GPRnopc:$Rd), (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; let Inst{27-20} = 0b01101110; let Inst{11-4} = 0b11110011; let Inst{15-12} = Rd; let Inst{19-16} = sat_imm; let Inst{3-0} = Rn; } def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), (USAT imm0_31:$pos, GPRnopc:$a, 0)>; def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm), (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm), (USAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos), (SSAT16 imm1_16:$pos, GPRnopc:$a)>; def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos), (USAT16 imm0_15:$pos, GPRnopc:$a)>; //===----------------------------------------------------------------------===// // Bitwise Instructions. // defm AND : AsI1_bin_irs<0b0000, "and", IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>; defm ORR : AsI1_bin_irs<0b1100, "orr", IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>; defm EOR : AsI1_bin_irs<0b0001, "eor", IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>; defm BIC : AsI1_bin_irs<0b1110, "bic", IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(and node:$LHS, (not node:$RHS))>>; // FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just // like in the actual instruction encoding. 
The complexity of mapping the mask // to the lsb/msb pair should be handled by ISel, not encapsulated in the // instruction description. def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm), AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, "bfc", "\t$Rd, $imm", "$src = $Rd", [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>, Requires<[IsARM, HasV6T2]> { bits<4> Rd; bits<10> imm; let Inst{27-21} = 0b0111110; let Inst{6-0} = 0b0011111; let Inst{15-12} = Rd; let Inst{11-7} = imm{4-0}; // lsb let Inst{20-16} = imm{9-5}; // msb } // A8.6.18 BFI - Bitfield insert (Encoding A1) def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm), AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd", [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm))]>, Requires<[IsARM, HasV6T2]> { bits<4> Rd; bits<4> Rn; bits<10> imm; let Inst{27-21} = 0b0111110; let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 let Inst{15-12} = Rd; let Inst{11-7} = imm{4-0}; // lsb let Inst{20-16} = imm{9-5}; // width let Inst{3-0} = Rn; } def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, "mvn", "\t$Rd, $Rm", [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; let Inst{25} = 0; let Inst{19-16} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{15-12} = Rd; let Inst{3-0} = Rm; let Unpredictable{19-16} = 0b1111; } def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift), DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = 0b0000; let Inst{15-12} = Rd; let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; let Unpredictable{19-16} = 0b1111; } def MVNsr : AsI1<0b1111, (outs GPRnopc:$Rd), (ins so_reg_reg:$shift), DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", [(set GPRnopc:$Rd, (not so_reg_reg:$shift))]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> shift; let Inst{25} = 0; let Inst{19-16} = 0b0000; let Inst{15-12} = Rd; let Inst{11-8} = shift{11-8}; let Inst{7} = 0; let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; let Unpredictable{19-16} = 0b1111; } let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMVNi, "mvn", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> imm; let Inst{25} = 1; let Inst{19-16} = 0b0000; let Inst{15-12} = Rd; let Inst{11-0} = imm; } let AddedComplexity = 1 in def : ARMPat<(and GPR:$src, mod_imm_not:$imm), (BICri GPR:$src, mod_imm_not:$imm)>; //===----------------------------------------------------------------------===// // Multiply Instructions. 
//

class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
                string opc, string asm, list<dag> pattern>
  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
  bits<4> Rd;
  bits<4> Rm;
  bits<4> Rn;
  let Inst{19-16} = Rd;
  let Inst{11-8}  = Rm;
  let Inst{3-0}   = Rn;
}
class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
                string opc, string asm, list<dag> pattern>
  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
  bits<4> RdLo;
  bits<4> RdHi;
  bits<4> Rm;
  bits<4> Rn;
  let Inst{19-16} = RdHi;
  let Inst{15-12} = RdLo;
  let Inst{11-8}  = Rm;
  let Inst{3-0}   = Rn;
}
class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
                string opc, string asm, list<dag> pattern>
  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
  bits<4> RdLo;
  bits<4> RdHi;
  bits<4> Rm;
  bits<4> Rn;
  let Inst{19-16} = RdHi;
  let Inst{15-12} = RdLo;
  let Inst{11-8}  = Rm;
  let Inst{3-0}   = Rn;
}

// FIXME: The v5 pseudos are only necessary for the additional Constraint
//        property. Remove them when it's possible to add those properties
//        on an individual MachineInstr, not just an instruction description.
let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm),
                    IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
                    [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
          Requires<[IsARM, HasV6]>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
  let Inst{15-12} = 0b0000;
  let Unpredictable{15-12} = 0b1111;
}

let Constraints = "@earlyclobber $Rd" in
def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
                           (ins GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s),
                           4, IIC_iMUL32,
                           [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
                           (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm,
                                pred:$p, cc_out:$s)>,
          Requires<[IsARM, NoV6, UseMulOps]>,
          Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}

def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
                    IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm),
                                            GPRnopc:$Ra))]>,
          Requires<[IsARM, HasV6, UseMulOps]>,
          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
  bits<4> Ra;
  let Inst{15-12} = Ra;
}

let Constraints = "@earlyclobber $Rd" in
def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
                           (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
                                pred:$p, cc_out:$s),
                           4, IIC_iMAC32,
                           [(set GPRnopc:$Rd,
                                 (add (mul GPRnopc:$Rn, GPRnopc:$Rm),
                                      GPRnopc:$Ra))],
                           (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm,
                                GPRnopc:$Ra, pred:$p, cc_out:$s)>,
          Requires<[IsARM, NoV6]>,
          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;

def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                 IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
                 [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
          Requires<[IsARM, HasV6T2, UseMulOps]>,
          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
  bits<4> Rd;
  bits<4> Rm;
  bits<4> Rn;
  bits<4> Ra;
  let Inst{19-16} = Rd;
  let Inst{15-12} = Ra;
  let Inst{11-8}  = Rm;
  let Inst{3-0}   = Rn;
}

// Extra precision multiplies with low / high results
let hasSideEffects = 0 in {
let isCommutable = 1 in {
def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
                      (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
                      "smull", "\t$RdLo, $RdHi, $Rn, $Rm",
                      [(set GPR:$RdLo, GPR:$RdHi,
                            (smullohi GPR:$Rn, GPR:$Rm))]>,
            Requires<[IsARM, HasV6]>,
            Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;

def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
                      (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
                      "umull", "\t$RdLo, $RdHi, $Rn, $Rm",
                      [(set GPR:$RdLo, GPR:$RdHi,
                            (umullohi GPR:$Rn, GPR:$Rm))]>,
            Requires<[IsARM, HasV6]>,
            Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL]>;

let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def SMULLv5 : ARMPseudoExpand<(outs
GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 4, IIC_iMUL64, [(set GPR:$RdLo, GPR:$RdHi, (smullohi GPR:$Rn, GPR:$Rm))], (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>, Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>; def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 4, IIC_iMUL64, [(set GPR:$RdLo, GPR:$RdHi, (umullohi GPR:$Rn, GPR:$Rm))], (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>, Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>; } } // Multiply + accumulate def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>, Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>, Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>, Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> { bits<4> RdLo; bits<4> RdHi; bits<4> Rm; bits<4> Rn; let Inst{19-16} = RdHi; let Inst{15-12} = RdLo; let Inst{11-8} = Rm; let Inst{3-0} = Rn; } let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in { def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>, Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>, Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; } } // hasSideEffects // Most significant word multiply def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV6]>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> { let Inst{15-12} = 0b1111; } def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, (i32 0)))]>, Requires<[IsARM, HasV6]>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> { let Inst{15-12} = 0b1111; } def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, HasV6, UseMulOps]>, Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>; def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, GPR:$Ra))]>, Requires<[IsARM, HasV6]>, Sched<[WriteMAC32, 
                                                ReadMUL, ReadMUL, ReadMAC]>;

def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
                     (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                     IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
            Requires<[IsARM, HasV6, UseMulOps]>,
            Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;

def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
                      (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                      IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
                      [(set GPR:$Rd, (ARMsmmlsr GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
             Requires<[IsARM, HasV6]>,
             Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;

multiclass AI_smul<string opc> {
  def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (bb_mul GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;

  def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (bt_mul GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;

  def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (tb_mul GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;

  def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (tt_mul GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;

  def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;

  def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                   IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
                   [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
           Requires<[IsARM, HasV5TE]>,
           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
}

multiclass AI_smla<string opc> {
  let DecoderMethod = "DecodeSMLAInstruction" in {
  def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (bb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;

  def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (bt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;

  def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (tb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;

  def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (tt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;

  def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;

  def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
                    (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                    IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPRnopc:$Rd, (add GPR:$Ra,
                                        (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
           Requires<[IsARM, HasV5TE, UseMulOps]>,
           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
  }
}
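// The <x><y> suffixes select the bottom ('b') or top ('t') halfword of each
// source operand, so the defms below give e.g.:
//   smulbb r0, r1, r2       @ r0 = sext(r1[15:0]) * sext(r2[15:0])
//   smlabt r0, r1, r2, r3   @ r0 = sext(r1[15:0]) * sext(r2[31:16]) + r3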
defm SMUL : AI_smul<"smul">;
defm SMLA : AI_smla<"smla">;

// Halfword multiply accumulate long: SMLAL<x><y>.
class SMLAL<bits<2> opc1, string asm>
  : AMulxyI64<0b0001010, opc1,
              (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
              IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
    RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
    Requires<[IsARM, HasV5TE]>,
    Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;

def SMLALBB : SMLAL<0b00, "smlalbb">;
def SMLALBT : SMLAL<0b10, "smlalbt">;
def SMLALTB : SMLAL<0b01, "smlaltb">;
def SMLALTT : SMLAL<0b11, "smlaltt">;

def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
                 (SMLALBB $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
                 (SMLALBT $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
                 (SMLALTB $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
                 (SMLALTT $Rn, $Rm, $RLo, $RHi)>;

// Helper class for AI_smld.
class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
                    InstrItinClass itin, string opc, string asm>
  : AI<oops, iops, MulFrm, itin, opc, asm, []>,
    Requires<[IsARM, HasV6]> {
  bits<4> Rn;
  bits<4> Rm;
  let Inst{27-23} = 0b01110;
  let Inst{22} = long;
  let Inst{21-20} = 0b00;
  let Inst{11-8} = Rm;
  let Inst{7} = 0;
  let Inst{6} = sub;
  let Inst{5} = swap;
  let Inst{4} = 1;
  let Inst{3-0} = Rn;
}
class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
                InstrItinClass itin, string opc, string asm>
  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
  bits<4> Rd;
  let Inst{15-12} = 0b1111;
  let Inst{19-16} = Rd;
}
class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
                 InstrItinClass itin, string opc, string asm>
  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
  bits<4> Ra;
  bits<4> Rd;
  let Inst{19-16} = Rd;
  let Inst{15-12} = Ra;
}
class AMulDualI64