diff --git a/llvm/utils/schedtool/README.md b/llvm/utils/schedtool/README.md new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/README.md @@ -0,0 +1,121 @@ +schedtools is a collection of tools used to automatically generate llvm x86 schedule model. Input to it is a json file containing scheduling information and some additional infomation for all llvm x86 instructions. This tool can also verify existing schedule mode comparing to given input json. + +## Usage Examples +Given input json, generate alderlake-p schedule model: + + smg gen --target-cpu=alderlake-p ADLP.json -o X86SchedAlderlakeP.td + +Given input json, verify alderlake-p schedule model: + + smg verify --target-cpu=alderlake-p ADLP.json + +Generate alderlake-p input json (refer to [Tools](##Tools) for more detail): + + llvm-tblgen -I llvm/include llvm/lib/Target/X86/X86.td -I llvm/lib/Target/X86/ --gen-x86-inst-sched-info | + add_xed_info.py --xed /obj/wkit/examples/obj/xed | + add_uops_uopsinfo.py --inst-xml instructions.xml --arch-name=ADL-P | + add_adl_p_uopsinfo.py --adl-p-json tpt_lat-glc-client.json | + add_smv_uopsinfo.py --ref-cpu=skylake --target-cpu=alderlake-p -o input.json + +## Input JSON Format + + { + "AAA": { + "SchedReads": [], + "SchedWrites": [ + { + "Name": "WriteMicrocoded", + "Type": "SchedWrite" + } + ], + "XedInfo": { + "IsaSet": "I86" + }, + "Port": [[1, [0, 1, 5, 6 ]]], + "Uops": 1, + "Tp": 0.25, + "Latency": 100, + }, + ... + } + +Input json should be a dict which uses llvm x86 instruction's opcode as its key. It's value contains a list of information to describe this opcode. +"SchedReads" and "SchedWrites" must be presented. +"XedInfo" is optional. If it is presented, "IsaSet" must be presented. It is used to determin if this instruction is supported by specifc target. +"Port", "Uops", "Tp", "Latency" are optional. "Port" format is [[num\_uop\_a, ports of uop\_a], ...]. + +## Tools +Below is useful tools to assist in generating input json. 
+ +### X86InstSchedInfo Emitter +This is a llvm tablegen backend that is capable to enumerate all x86 instruction's "SchedReads" and "SchedWrites" to form an initial json file. In addition, it can also enumerate an asm string for each matchable instructions(in AsmMatcher perspective). This asm string can be used by other tool to generate "XedInfo". + +Usage: + + git am /llvm-patch/0003-Add-gen-x86-inst-sched-info-to-emit-x86-instruction-.patch + rebuild llvm && cd llvm-dir + llvm-tblgen -I llvm/include llvm/lib/Target/X86/X86.td -I llvm/lib/Target/X86/ --gen-x86-inst-sched-info -o input1.json + +Here's piece of output json. "Modes" indicates all valid encoding modes(16bit, 32bit, 64bit). + + { + "AAA": { + "AsmString": "aaa", + "Modes": [32, 16], + "SchedReads": [], + "SchedWrites": [ + { + "Name": "WriteMicrocoded", + "Type": "SchedWrite" + } + ] + }, + ... + } + +### tools/add\_xed\_info.py +This tool is used to add "XedInfo" to input json. +Input to add\_xed\_info.py is normally a json file generated by [X86InstSchedInfo emitter](###x86instschedinfo_emitter). Output is also a json with more rich information. +This tool will first use llvm-mc (make sure it is in path) to verify and try to correct "AsmString" so the opcode of matched MCInst is the same with the input json. It then encode the corrected "AsmString" and store it into the output json. After all "AsmString" being fixed and encoded, This tool uses xed to decode the "Encoding" and extract "IForm", "IsaSet", etc to form "XedInfo" in output json. + +Usage: + + # Build xed + git clone https://github.com/intelxed/xed.git + git clone https://github.com/intelxed/mbuild.git + cd xed + # Print more rich info when decoding. 
+ git am /xed-patch/0001-Dump-eosz-and-operand-s-xtype-width-when-verbosity-i.patch + ./mfile.py + ./mfile.py examples + + cd llvm-dir + git am /llvm-patch/0002-Support-debug-only-print-opcode-to-llvm-mc.patch + rebuild llvm + add_xed_info.py --xed /obj/wkit/examples/obj/xed --jf input1.json -o input2.json + +### tools/add\_uops\_uopsinfo.py +This tool is used to add corresponding "Port", "Uops", "Tp", "latency" from uops.info to input json. It won't update those info it already exited. +Input json to add\_uops\_uopsinfo.py must contain "XedInfo" because it uses this to find the corresponding record in uops.info. Another input to this tool is instructions.xml file. You can download it from [uops.info](https://uops.info/xml.html). + +Usage: + + # arch-name is "architecture name" in instructions.xml + add_uops_uopsinfo.py --inst-xml instructions.xml --arch-name=ADL-P --jf input2.json -o input3.json + +### tools/add\_adl\_p\_uopsinfo.py +This tool is used to add "Port", "Uops", "Tp", "Latency" from json provided by [intel](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html) for GLC. It won't update those info it already exited. + +Usage: + + add_adl_p_uopsinfo.py --adl-p-json tpt_lat-glc-client.json --jf input3.json -o input4.json + +### tools/add\_smv\_uopsinfo.py +This tool is used to add "Port", "Uops", "Tp", "Latency" from existing schedule model. There are always some corner instructions that we don't have much scheduling information about them from uops.info or other source. This blocked us to generate a relative complete schedule model. Thus this tools is helpful since we can find nearly all instruction's scheduling info from existing schedule model though they may not be correct. Currently only part of reference targets are supported since we need to map ports between target-cpu and ref-cpu. 
+ +Usage: + + cd llvm-dir + git am /llvm-patch/0001-Add-llvm-smv-tool-to-auto-generate-instruction-sched.patch + rebuild llvm + add_smv_uopsinfo.py --ref-cpu=skylake --target-cpu=alderlake-p --jf input4.json -o input5.json diff --git a/llvm/utils/schedtool/lib/__init__.py b/llvm/utils/schedtool/lib/__init__.py new file mode 100644 diff --git a/llvm/utils/schedtool/lib/info_parser.py b/llvm/utils/schedtool/lib/info_parser.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/info_parser.py @@ -0,0 +1,88 @@ +from lib.llvm_instr import * +from lib import utils + + +def parse_llvm_instr_info(instr_info, target_cpu): + def scan_schedwrite(write_desc): + write_type = write_desc['Type'] + if write_type == 'SchedWrite' or write_type == 'X86FoldableSchedWrite': + return SchedWrite(write_desc['Name']) + elif write_type == 'WriteSequence': + name = write_desc['Name'] + writes = [ + scan_schedwrite(next_desc) + for next_desc in write_desc['Writes'] + ] + repeat = write_desc['Repeat'] + return WriteSequence(name, writes, repeat) + else: + raise TypeError(f'Unknown schedwrite type: {write_type}') + + llvm_instrs = [] + for opcode, desc in instr_info.items(): + schedreads, schedwrites = [], [] + for read_desc in desc['SchedReads']: + assert read_desc['Type'] == 'SchedRead', 'Unknown schedread type' + schedreads.append(SchedRead(read_desc['Name'])) + for write_desc in desc['SchedWrites']: + schedwrites.append(scan_schedwrite(write_desc)) + isa_set = desc['XedInfo']['IsaSet'] if 'XedInfo' in desc else None + llvm_instr = LLVMInstr(opcode, schedreads, schedwrites, isa_set) + if 'Port' in desc and not llvm_instr.is_invalid(target_cpu): + uops = [] + latency = desc.get('Latency', target_cpu.max_latency) + throughput = desc.get('Tp', None) + for item in desc['Port']: + uop = Uop(ports=[Port(pn) for pn in item[1]]) + assert all(Port(pn) is Port.INVALID_PORT or + Port(pn) in target_cpu.all_ports for pn in item[1]),\ + f'Found invalid port in {item[1]}' + uops.extend([uop] 
* item[0]) + num_uops = desc.get('Uops', len(uops)) + llvm_instr.set_uops_info( + UopsInfo(latency, throughput, uops, num_uops)) + llvm_instrs.append(llvm_instr) + return llvm_instrs + + +def infer_res(resources, resource_cycles): + class Node: + def __init__(self, res, cycs): + self.res = res + self.cycs = cycs + self.next = [] + + nodes = [ + Node(res, cycles) for res, cycles in zip(resources, resource_cycles) + ] + for node in nodes: + for other in nodes: + if other != node and utils.listcontain(other.res, node.res): + node.next.append(other) + nodes.sort(key=lambda x: len(x.next), reverse=True) + for node in nodes: + if node.cycs > 0: + for next_node in node.next: + assert next_node.cycs > 0 + next_node.cycs -= node.cycs + + leaf_res, leaf_res_cycs = [], [] + for node in nodes: + if node.cycs > 0: + leaf_res.append(node.res) + leaf_res_cycs.append(node.cycs) + return leaf_res, leaf_res_cycs + + +def parse_smv_instr_info(instr_info, target_cpu): + smv_instrs = [] + for opcode, desc in instr_info.items(): + resources, resource_cycles = [], [] + for ports_name, cycles in desc['WriteRes'].items(): + resources.append(target_cpu.parse_ports_name(ports_name)) + resource_cycles.append(cycles) + resources, resource_cycles = infer_res(resources, resource_cycles) + smv_instrs.append( + SMVInstr(opcode, int(desc['Latency']), int(desc['NumUops']), + float(desc['RThroughput']), resources, resource_cycles)) + return smv_instrs diff --git a/llvm/utils/schedtool/lib/llvm_instr.py b/llvm/utils/schedtool/lib/llvm_instr.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/llvm_instr.py @@ -0,0 +1,399 @@ +import re + +try: + import utils +except ModuleNotFoundError: + import lib.utils as utils + + +class Singleton(type): + ''' + Each subclass should implement get_key static method to hash a uniq id + for args passed to init and shouldn't use _instances as attr. 
+ ''' + def __new__(meta_cls, class_name, base_classes, attrs): + attrs['_instances'] = {} + cls = super().__new__(meta_cls, class_name, base_classes, attrs) + + # Create get static method for all subclasses. + def get(*args, **kwargs): + key = cls.get_key(*args, **kwargs) + return cls._instances.get(key) + + cls.get = get + return cls + + def __call__(cls, *arg, **kwargs): + key = cls.get_key(*arg, **kwargs) + if key not in cls._instances: + cls._instances[key] = super().__call__(*arg, **kwargs) + return cls._instances[key] + + +class ReadOnly: + def __set_name__(self, owner, name): + self.private_name = '_' + name + + def __get__(self, obj, objtype=None): + return getattr(obj, self.private_name) + + def __set__(self, obj, value): + raise TypeError("Can't assign to read only type.") + + +class Resource(metaclass=Singleton): + pass + + +class Port(Resource): + def __init__(self, number): + assert isinstance(number, int), 'Expect int type' + self._number = number + + @staticmethod + def get_key(number): + return number + + def __lt__(self, other): + return self._number < other._number + + def __str__(self): + return f'{self._number}' + + def __repr__(self): + return self.__str__() + + @staticmethod + def gets(nums): + return tuple(Port(num) for num in nums) + + class GetInvalidPort: + def __get__(self, obj, objtype=None): + assert objtype is Port + return objtype(-1) + + INVALID_PORT = GetInvalidPort() + + +class Uop: + ''' Port, latency and throughput info for micro-op ''' + def __init__(self, ports, latency=None, throughput=None): + assert len(ports) > 0 + assert latency is None or isinstance(latency, int) + assert throughput is None or isinstance(throughput, float) + self.ports = tuple(sorted(ports)) + self.latency = latency + self.throughput = throughput + + @staticmethod + def get_key(ports, latency=None, throughput=None): + return (tuple(sorted(ports)), latency, throughput) + + def __repr__(self): + return str(self.ports) + + def __lt__(self, other): + if 
self.ports != other.ports: + return self.ports < other.ports + if self.latency != other.latency: + return utils.lt_none(self.latency, other.latency) + if self.throughput != other.throughput: + return utils.lt_none(self.throughput, other.throughput) + return False + + +class UopsInfo: + ''' Uops info for instruction. ''' + def __init__(self, latency, throughput, uops, num_uops): + assert all(x is not None for x in (latency, uops, num_uops)) + assert isinstance(latency, int) + assert isinstance(throughput, (type(None), float)) + self.latency = latency + self.throughput = throughput + self.uops = tuple(sorted(uops)) + self.num_uops = num_uops + + @property + def ports(self): + return tuple(uop.ports for uop in self.uops) + + @staticmethod + def get_key(latency, throughput, uops): + return (latency, throughput, tuple(sorted(uops))) + + def __lt__(self, other): + if self.latency != other.latency: + return self.latency < other.latency + if self.throughput != other.throughput: + return self.throughput < other.throughput + if len(self.uops) != len(other.uops): + return len(self.uops) < len(other.uops) + if self.uops != other.uops: + for uop_a, uop_b in zip(self.uops, other.uops): + if uop_a != uop_b: + return uop_a < uop_b + return False + + def __repr__(self): + return (f'\n' + f' latency = {self.latency}\n' + f' throughput = {self.throughput}\n' + f' num_uops = {self.num_uops}\n' + f' uops = {self.uops}\n') + + def __str__(self): + return self.__repr__() + + +class SchedWrite(metaclass=Singleton): + def __init__(self, name): + self.name = name + self.__is_support = True + + # Each instruction may associate with many schedwrites. schedwrite that + # is removeable for all instructions are considered to be aux schedwrite. + # Currenty, each instruction only have 1 non aux schedwrite. + # For simplicity, we can manually define aux schedwrite so that only 1 + # schedwrite need to be infered. 
+ self.__is_aux = False + + def set_resources(self, + resources, + resource_cycles, + latency, + num_uops, + is_aux=False): + assert len(resources) == len(resource_cycles) + assert all(x is not None + for x in (resources, resource_cycles, latency, num_uops)) + assert num_uops >= 0 and latency >= 0 + self.resources = tuple(resources) + self.resource_cycles = tuple(resource_cycles) + self.latency = latency + self.num_uops = num_uops + self.__is_aux = is_aux + + def set_supported(self, value): + self.__is_support = value + + def is_supported(self): + return self.__is_support + + def is_aux(self): + return self.__is_aux + + @staticmethod + def get_key(name): + return name + + def get_all(): + ''' Get all schedwrites created so far. ''' + return tuple(SchedWrite._instances.values()) + + def is_complete(self): + return all( + hasattr(self, attr) for attr in + ['resources', 'resource_cycles', 'latency', 'num_uops']) + + def __str__(self): + return f'{self.name}' + + def __repr__(self): + return self.__str__() + + def __hash__(self): + return hash(self.name) + + def __lt__(self, other): + # Basic class comes first. 
+ if type(other) is not type(self): + return issubclass(type(other), type(self)) + return self.name < other.name + + +class WriteSequence(SchedWrite): + def __init__(self, name, writes, repeat): + super().__init__(name) + self._writes = writes + self._repeat = repeat + assert not (hasattr(self, '__is_support') or hasattr(self, '__is_aux')) + + @staticmethod + def get_key(name, writes, repeat): + return name + + def is_complete(self): + return all(x.is_complete() for x in self._writes) + + def is_supported(self): + return all(x.is_supported() for x in self._writes) + + def is_aux(self): + return all(x.is_aux() for x in self.expand()) + + def set_resources(self, *args, **kwargs): + raise TypeError('Cant set_resources on WriteSequence') + + @property + def latency(self): + return sum(leaf_write.latency for leaf_write in self.expand()) + + @property + def num_uops(self): + return sum(leaf_write.num_uops for leaf_write in self.expand()) + + @property + def resources(self): + resources = [] + for leaf_write in self.expand(): + resources.extend(leaf_write.resources) + return tuple(resources) + + def expand(self): + ''' + Expand WriteSequence to leaf schedwrites. + ''' + leaf_writes = [] + for i in range(self._repeat): + for sub_write in self._writes: + if type(sub_write) is WriteSequence: + leaf_writes.extend(sub_write.expand()) + else: + leaf_writes.append(sub_write) + return leaf_writes + + def __str__(self): + return f'{self.name} writes:{self._writes} repeat:{self._repeat}' + + def __repr__(self): + return self.__str__() + + +class SchedWriteRes(SchedWrite): + def __init__(self, + resources, + resource_cycles, + latency, + num_uops, + prefix=""): + # prefix will be ignored if SchedWriteRes with same resources existed. 
+ name = f'{prefix}WriteResGroup{len(SchedWriteRes._instances)}' + super().__init__(name) + self.set_resources(resources=resources, + resource_cycles=resource_cycles, + latency=latency, + num_uops=num_uops) + + def is_supported(self): + return True + + def is_aux(self): + return False + + @staticmethod + def get_key(resources, resource_cycles, latency, num_uops, prefix): + return (resources, resource_cycles, latency, num_uops) + + def __lt__(self, other): + if type(other) is not type(self): + return super().__lt__(other) + + idx0 = int(re.match(r'^\w+WriteResGroup(\d+)', self.name).group(1)) + idx1 = int(re.match(r'^\w+WriteResGroup(\d+)', other.name).group(1)) + assert idx0 != idx1, 'duplicate SchedWriteRes' + return idx0 < idx1 + + +class SchedRead(metaclass=Singleton): + def __init__(self, name: str): + self.name = name + + @staticmethod + def get_key(name): + return name + + def __str__(self): + return f'{self.name}' + + def __repr__(self): + return self.__str__() + + +class LLVMInstr: + ''' Instruction defined in td file ''' + def __init__(self, opcode, schedreads, schedwrites, isa_set): + self.opcode = opcode + self.schedreads = schedreads + self.schedwrites = schedwrites + self.isa_set = isa_set + self._use_instrw = False + + def set_uops_info(self, uops_info): + self.uops_info = uops_info + + def set_use_instrw(self, value): + self._use_instrw = value + + def use_instrw(self): + return self._use_instrw + + def has_uops_info(self): + return hasattr(self, 'uops_info') + + def is_invalid(self, target_cpu): + return (self.isa_set is not None + and self.isa_set not in target_cpu.valid_isa_set) + + def replace_or_add_schedrw(self, + old_schedrw, + new_schedrw, + is_read=False, + *, + not_null=False): + schedrws = self.schedreads if is_read else self.schedwrites + if not not_null and old_schedrw is None: + schedrws.append(new_schedrw) + else: + schedrws[schedrws.index(old_schedrw)] = new_schedrw + + def compute_latency(self): + return max(schedwrite.latency for 
schedwrite in self.schedwrites) + + def compute_num_uops(self): + return sum(schedwrite.num_uops for schedwrite in self.schedwrites) + + def compute_resources(self): + resources = [] + for schedwrite in self.schedwrites: + resources.extend(schedwrite.resources) + return tuple(resources) + + def __repr__(self): + return (f'{self.opcode}:\n' + f' schedreads = {self.schedreads}\n' + f' schedwrites = {self.schedwrites}\n' + f' isa_set = {self.isa_set}\n' + f' use_instrw = {self._use_instrw}\n' + f' uops_info = {getattr(self, "uops_info", None)}\n') + + def __str__(self): + return self.__repr__() + + +class SMVInstr: + def __init__(self, opcode, latency, num_uops, throughput, resources, + resource_cycles): + self.opcode = opcode + self.latency = latency + self.num_uops = num_uops + self.throughput = throughput + self.resources = resources + self.resource_cycles = resource_cycles + + def __repr__(self): + return (f'{self.opcode}:\n' + f' latency = {self.latency}\n' + f' num_uops = {self.num_uops}\n' + f' throughput = {self.throughput}\n' + f' resources = {self.resources}\n' + f' resource_cycles = {self.resource_cycles}\n') diff --git a/llvm/utils/schedtool/lib/target.py b/llvm/utils/schedtool/lib/target.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/target.py @@ -0,0 +1,424 @@ +import os, unittest + +try: + import utils + from llvm_instr import Port, SchedWrite +except ModuleNotFoundError: + from lib import utils + from lib.llvm_instr import Port, SchedWrite + +workdir = f'{os.path.dirname(os.path.realpath(__file__))}' + + +def get_target(target_cpu): + target_map = { + 'alderlake-p': AlderlakeP, + 'sapphirerapids': SapphireRapids, + 'skylake': Skylake, + 'skylake-avx512': SkylakeServer, + 'icelake-server': IcelakeServer, + } + if target_cpu not in target_map: + raise NotImplementedError(f'Unknown target cpu "{target_cpu}"\n' + f'Valid target is {list(target_map.keys())}') + return target_map[target_cpu]() + + +class TargetCPU: + def __init__(self, 
short_name, proc_name, model_name=None): + self.short_name = short_name + self.proc_name = proc_name + self.model_name = f'{proc_name.capitalize()}Model' \ + if model_name is None else model_name + self.all_ports = None + + def get_ports_name(self, ports): + if len(ports) == 0: + return '' + + if utils.cmplist(ports, self.all_ports): + return f'{self.short_name}PortAny' + + if utils.cmplist(ports, (Port.INVALID_PORT, )): + return f'{self.short_name}PortInvalid' + + assert all(port in self.all_ports for port in ports) + return utils.nums2str((str(port) for port in ports), 2, '_', + f'{self.short_name}Port') + + def parse_ports_name(self, ports_name: str): + ''' Convert ports name to Port. ''' + if ports_name == f'{self.short_name}PortAny': + return self.all_ports + + if ports_name == f'{self.short_name}PortInvalid': + return (Port.INVALID_PORT, ) + + ports = [] + for num in utils.str2nums(ports_name, '_', f'{self.short_name}Port'): + assert Port(num) in self.all_ports + ports.append(Port(num)) + return tuple(ports) + + def lat2str(self, latency): + if latency == self.max_latency: + return f'{self.model_name}.MaxLatency' + else: + return str(latency) + + +class AlderlakeP(TargetCPU): + valid_isa_set = frozenset(''' + 3DNOW_PREFETCH ADOX_ADCX AES AVX + AVX2 AVX2GATHER AVXAES AVX_GFNI + AVX_VNNI BMI1 BMI2 CET + CLDEMOTE CLFLUSHOPT CLFSH CLWB + CMOV CMPXCHG16B F16C FAT_NOP + FCMOV FMA FXSAVE FXSAVE64 + GFNI HRESET I186 I286PROTECTED + I286REAL I386 I486 I486REAL + I86 INVPCID KEYLOCKER KEYLOCKER_WIDE + LAHF LONGMODE LZCNT MONITOR + MOVBE MOVDIR PAUSE PCLMULQDQ + PCONFIG PENTIUMMMX PENTIUMREAL PKU + POPCNT PPRO PPRO_UD0_SHORT PREFETCHW + PREFETCH_NOP PTWRITE RDPID RDPMC + RDRAND RDSEED RDTSCP RDWRFSGS + SERIALIZE SHA SMAP SMX + SSE SSE2 SSE2MMX SSE3 + SSE3X87 SSE4 SSE42 SSEMXCSR + SSE_PREFETCH SSSE3 SSSE3MMX VAES + VMFUNC VPCLMULQDQ VTX WAITPKG + WBNOINVD X87 XSAVE XSAVEC + XSAVEOPT XSAVES + '''.split()) + + def __init__(self): + super().__init__('ADLP', 'alderlake', 
'AlderlakePModel') + self.all_ports = tuple( + Port(num) for num in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)) + self.load_ports = Port.gets((2, 3, 11)) + self.load_latency = 5 + self.max_latency = 100 + self.template_td = f'{workdir}/template/alderlake-p.td' + + # Manually set some schedwrites resources instead of infering it. + self.__set_schedwrite_resource() + + def __set_schedwrite_resource(self): + ADLPPort04_09 = Port.gets((4, 9)) + ADLPPort07_08 = Port.gets((7, 8)) + + # Manually define aux SchedWrite here. + SchedWrite('WriteIMulH').set_resources(resources=(), + resource_cycles=(), + latency=3, + num_uops=1, + is_aux=True) + SchedWrite('WriteIMulHLd').set_resources(resources=(), + resource_cycles=(), + latency=3, + num_uops=1, + is_aux=True) + SchedWrite('WriteRMW').set_resources(resources=(self.load_ports, + ADLPPort04_09, + ADLPPort07_08), + resource_cycles=(1, 1, 1), + latency=1, + num_uops=3, + is_aux=True) + SchedWrite('WriteVecMaskedGatherWriteback').set_resources( + resources=(), + resource_cycles=(), + latency=self.load_latency, + num_uops=0, + is_aux=True) + + # Manually define non-aux SchedWrite here. 
+ SchedWrite('WriteZero').set_resources(resources=(), + resource_cycles=(), + latency=1, + num_uops=1) + SchedWrite('WriteLoad').set_resources(resources=(self.load_ports, ), + resource_cycles=(1, ), + latency=self.load_latency, + num_uops=1) + + +class SapphireRapids(TargetCPU): + valid_isa_set = frozenset(''' + 3DNOW_PREFETCH ADOX_ADCX AES AMX_BF16 + AMX_INT8 AMX_TILE AVX AVX2 + AVX2GATHER AVX512BW_128 AVX512BW_128N AVX512BW_256 + AVX512BW_512 AVX512BW_KOP AVX512CD_128 AVX512CD_256 + AVX512CD_512 AVX512DQ_128 AVX512DQ_128N AVX512DQ_256 + AVX512DQ_512 AVX512DQ_KOP AVX512DQ_SCALAR AVX512F_128 + AVX512F_128N AVX512F_256 AVX512F_512 AVX512F_KOP + AVX512F_SCALAR AVX512_BF16_128 AVX512_BF16_256 AVX512_BF16_512 + AVX512_BITALG_128 AVX512_BITALG_256 AVX512_BITALG_512 AVX512_FP16_128 + AVX512_FP16_128N AVX512_FP16_256 AVX512_FP16_512 AVX512_FP16_SCALAR + AVX512_GFNI_128 AVX512_GFNI_256 AVX512_GFNI_512 AVX512_IFMA_128 + AVX512_IFMA_256 AVX512_IFMA_512 AVX512_VAES_128 AVX512_VAES_256 + AVX512_VAES_512 AVX512_VBMI2_128 AVX512_VBMI2_256 AVX512_VBMI2_512 + AVX512_VBMI_128 AVX512_VBMI_256 AVX512_VBMI_512 AVX512_VNNI_128 + AVX512_VNNI_256 AVX512_VNNI_512 AVX512_VP2INTERSECT_128 + AVX512_VP2INTERSECT_256 AVX512_VP2INTERSECT_512 AVX512_VPCLMULQDQ_128 + AVX512_VPCLMULQDQ_256 AVX512_VPCLMULQDQ_512 AVX512_VPOPCNTDQ_128 + AVX512_VPOPCNTDQ_256 AVX512_VPOPCNTDQ_512 AVXAES + AVX_GFNI AVX_VNNI BMI1 BMI2 + CET CLDEMOTE CLFLUSHOPT CLFSH + CLWB CMOV CMPXCHG16B ENQCMD + F16C FAT_NOP FCMOV FMA + FXSAVE FXSAVE64 GFNI I186 + I286PROTECTED I286REAL I386 I486 + I486REAL I86 INVPCID LAHF + LONGMODE LZCNT MONITOR MOVBE + MOVDIR PAUSE PCLMULQDQ PCONFIG + PENTIUMMMX PENTIUMREAL PKU POPCNT + PPRO PPRO_UD0_LONG PREFETCHW PREFETCH_NOP + PTWRITE RDPID RDPMC RDRAND + RDSEED RDTSCP RDWRFSGS RTM + SERIALIZE SGX SGX_ENCLV SHA + SMAP SMX SSE SSE2 + SSE2MMX SSE3 SSE3X87 SSE4 + SSE42 SSEMXCSR SSE_PREFETCH SSSE3 + SSSE3MMX TDX TSX_LDTRK UINTR + VAES VMFUNC VPCLMULQDQ VTX + WAITPKG WBNOINVD X87 XSAVE + XSAVEC 
XSAVEOPT XSAVES + '''.split()) + + def __init__(self): + super().__init__('SPR', 'sapphirerapids', 'SapphireRapidsModel') + self.all_ports = tuple( + Port(num) for num in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)) + self.load_ports = Port.gets((2, 3, 11)) + self.load_latency = 5 + self.max_latency = 100 + self.template_td = f'{workdir}/template/sapphirerapids.td' + + # Manually set some schedwrites resources instead of infering it. + self.__set_schedwrite_resource() + + def __set_schedwrite_resource(self): + SPRPort04_09 = Port.gets((4, 9)) + SPRPort07_08 = Port.gets((7, 8)) + SPRPort00_06 = Port.gets((0, 6)) + + # Manually define aux SchedWrite here. + SchedWrite('WriteIMulH').set_resources(resources=(), + resource_cycles=(), + latency=3, + num_uops=1, + is_aux=True) + SchedWrite('WriteIMulHLd').set_resources(resources=(), + resource_cycles=(), + latency=3, + num_uops=1, + is_aux=True) + SchedWrite('WriteRMW').set_resources(resources=(self.load_ports, + SPRPort04_09, + SPRPort07_08), + resource_cycles=(1, 1, 1), + latency=1, + num_uops=3, + is_aux=True) + SchedWrite('WriteVecMaskedGatherWriteback').set_resources( + resources=(), + resource_cycles=(), + latency=self.load_latency, + num_uops=0, + is_aux=True) + + # Manually define non-aux SchedWrite here. 
+ SchedWrite('WriteZero').set_resources(resources=(), + resource_cycles=(), + latency=1, + num_uops=1) + SchedWrite('WriteLoad').set_resources(resources=(self.load_ports, ), + resource_cycles=(1, ), + latency=self.load_latency, + num_uops=1) + SchedWrite('WriteCMOV').set_resources(resources=(SPRPort00_06, ), + resource_cycles=(1, ), + latency=1, + num_uops=1) + + +class Skylake(TargetCPU): + valid_isa_set = frozenset(''' + 3DNOW_PREFETCH ADOX_ADCX AES AVX + AVX2 AVX2GATHER AVXAES BMI1 + BMI2 CLFLUSHOPT CLFSH CMOV + CMPXCHG16B F16C FAT_NOP FCMOV + FMA FXSAVE FXSAVE64 I186 + I286PROTECTED I286REAL I386 I486 + I486REAL I86 INVPCID LAHF + LONGMODE LZCNT MONITOR MOVBE + MPX PAUSE PCLMULQDQ PENTIUMMMX + PENTIUMREAL POPCNT PPRO PPRO_UD0_LONG + PREFETCHW PREFETCH_NOP RDPMC RDRAND + RDSEED RDTSCP RDWRFSGS RTM + SGX SMAP SMX SSE + SSE2 SSE2MMX SSE3 SSE3X87 + SSE4 SSE42 SSEMXCSR SSE_PREFETCH + SSSE3 SSSE3MMX VMFUNC VTX + X87 XSAVE XSAVEC XSAVEOPT + XSAVES + '''.split()) + + def __init__(self): + super().__init__('SKL', 'skylake') + self.all_ports = tuple(Port(num) for num in (0, 1, 2, 3, 4, 5, 6, 7)) + self.load_ports = Port.gets((2, 3)) + self.load_latency = 5 + self.max_latency = 100 + + def parse_ports_name(self, ports_name: str): + ''' Convert ports name to Port. 
''' + if ports_name == f'{self.short_name}PortAny': + return self.all_ports + + if ports_name in (f'{self.short_name}Divider', + f'{self.short_name}FPDivider'): + return (Port.INVALID_PORT, ) + + ports = [] + for num in ports_name[len(f'{self.short_name}Port'):]: + num = int(num) + assert Port(num) in self.all_ports + ports.append(Port(num)) + return tuple(ports) + + +class SkylakeServer(TargetCPU): + valid_isa_set = frozenset(''' + 3DNOW_PREFETCH ADOX_ADCX AES AVX + AVX2 AVX2GATHER AVX512BW_128 AVX512BW_128N + AVX512BW_256 AVX512BW_512 AVX512BW_KOP AVX512CD_128 + AVX512CD_256 AVX512CD_512 AVX512DQ_128 AVX512DQ_128N + AVX512DQ_256 AVX512DQ_512 AVX512DQ_KOP AVX512DQ_SCALAR + AVX512F_128 AVX512F_128N AVX512F_256 AVX512F_512 + AVX512F_KOP AVX512F_SCALAR AVXAES BMI1 + BMI2 CLFLUSHOPT CLFSH CLWB + CMOV CMPXCHG16B F16C FAT_NOP + FCMOV FMA FXSAVE FXSAVE64 + I186 I286PROTECTED I286REAL I386 + I486 I486REAL I86 INVPCID + LAHF LONGMODE LZCNT MONITOR + MOVBE MPX PAUSE PCLMULQDQ + PENTIUMMMX PENTIUMREAL PKU POPCNT + PPRO PPRO_UD0_LONG PREFETCHW PREFETCH_NOP + RDPMC RDRAND RDSEED RDTSCP + RDWRFSGS RTM SGX SMAP + SMX SSE SSE2 SSE2MMX + SSE3 SSE3X87 SSE4 SSE42 + SSEMXCSR SSE_PREFETCH SSSE3 SSSE3MMX + VMFUNC VTX X87 XSAVE + XSAVEC XSAVEOPT XSAVES + '''.split()) + + def __init__(self): + super().__init__('SKX', 'skylake-avx512') + self.all_ports = tuple(Port(num) for num in (0, 1, 2, 3, 4, 5, 6, 7)) + self.load_ports = Port.gets((2, 3)) + self.load_latency = 5 + self.max_latency = 100 + + def parse_ports_name(self, ports_name: str): + ''' + Convert ports name to Port. 
+ ''' + if ports_name == f'{self.short_name}PortAny': + return self.all_ports + + if ports_name in (f'{self.short_name}Divider', + f'{self.short_name}FPDivider'): + return (Port.INVALID_PORT, ) + + ports = [] + for num in ports_name[len(f'{self.short_name}Port'):]: + num = int(num) + assert Port(num) in self.all_ports + ports.append(Port(num)) + return tuple(ports) + + +class IcelakeServer(TargetCPU): + valid_isa_set = frozenset(''' + 3DNOW_PREFETCH ADOX_ADCX AES AVX + AVX2 AVX2GATHER AVX512BW_128 AVX512BW_128N + AVX512BW_256 AVX512BW_512 AVX512BW_KOP AVX512CD_128 + AVX512CD_256 AVX512CD_512 AVX512DQ_128 AVX512DQ_128N + AVX512DQ_256 AVX512DQ_512 AVX512DQ_KOP AVX512DQ_SCALAR + AVX512F_128 AVX512F_128N AVX512F_256 AVX512F_512 + AVX512F_KOP AVX512F_SCALAR AVX512_BITALG_128 AVX512_BITALG_256 + AVX512_BITALG_512 AVX512_GFNI_128 AVX512_GFNI_256 AVX512_GFNI_512 + AVX512_IFMA_128 AVX512_IFMA_256 AVX512_IFMA_512 AVX512_VAES_128 + AVX512_VAES_256 AVX512_VAES_512 AVX512_VBMI2_128 AVX512_VBMI2_256 + AVX512_VBMI2_512 AVX512_VBMI_128 AVX512_VBMI_256 AVX512_VBMI_512 + AVX512_VNNI_128 AVX512_VNNI_256 AVX512_VNNI_512 AVX512_VPCLMULQDQ_128 + AVX512_VPCLMULQDQ_256 AVX512_VPCLMULQDQ_512 AVX512_VPOPCNTDQ_128 + AVX512_VPOPCNTDQ_256 AVX512_VPOPCNTDQ_512 AVXAES AVX_GFNI BMI1 + BMI2 CLFLUSHOPT CLFSH CLWB + CMOV CMPXCHG16B F16C FAT_NOP + FCMOV FCOMI FMA FXSAVE + FXSAVE64 GFNI I186 I286PROTECTED + I286REAL I386 I486 I486REAL + I86 INVPCID LAHF LONGMODE + LZCNT MONITOR MOVBE PAUSE + PCLMULQDQ PCONFIG PENTIUMMMX PENTIUMREAL + PKU POPCNT PPRO PPRO_UD0_LONG + PREFETCHW PREFETCH_NOP RDPID RDPMC + RDRAND RDSEED RDTSCP RDWRFSGS + RTM SGX SGX_ENCLV SHA + SMAP SMX SSE SSE2 + SSE2MMX SSE3 SSE3X87 SSE4 + SSE42 SSEMXCSR SSE_PREFETCH SSSE3 + SSSE3MMX VAES VMFUNC VPCLMULQDQ + VTX WBNOINVD X87 XSAVE + XSAVEC XSAVEOPT XSAVES + '''.split()) + + def __init__(self): + super().__init__('ICX', 'icelake-server') + self.all_ports = tuple( + Port(num) for num in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) + self.load_ports = 
Port.gets((2, 3)) + self.load_latency = 5 + self.max_latency = 100 + + def parse_ports_name(self, ports_name: str): + ''' + Convert ports name to Port. + ''' + if ports_name == f'{self.short_name}PortAny': + return self.all_ports + + if ports_name in (f'{self.short_name}Divider', + f'{self.short_name}FPDivider'): + return (Port.INVALID_PORT, ) + + ports = [] + for num in ports_name[len(f'{self.short_name}Port'):]: + num = int(num) + assert Port(num) in self.all_ports + ports.append(Port(num)) + return tuple(ports) + + +if __name__ == '__main__': + + class TargetChecker(unittest.TestCase): + def test_target(self): + target_cpu = AlderlakeP() + self.assertEqual(target_cpu.get_ports_name([]), '') + self.assertEqual(target_cpu.get_ports_name([Port(1), + Port(2)]), + 'ADLPPort01_02') + self.assertEqual(target_cpu.get_ports_name([Port.INVALID_PORT]), + 'ADLPPortInvalid') + self.assertEqual(target_cpu.parse_ports_name('ADLPPort1_3'), + (Port(1), Port(3))) + + unittest.main() diff --git a/llvm/utils/schedtool/lib/template/alderlake-p.td b/llvm/utils/schedtool/lib/template/alderlake-p.td new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/template/alderlake-p.td @@ -0,0 +1,116 @@ +//===- X86SchedAlderlakeP.td - X86 Alderlake-P Scheduling ----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Alderlake-P core to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def AlderlakePModel : SchedMachineModel { + // Alderlake-P core can allocate 6 uops per cycle. + let IssueWidth = 6; // Based on allocator width. 
+ let MicroOpBufferSize = 512; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Latency for microcoded instructions or instructions without latency info. + int MaxLatency = 100; + + // Based on the LSD (loop-stream detector) queue size (ST). + let LoopMicroOpBufferSize = 72; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = AlderlakePModel in { + +// Alderlake-P core can issue micro-ops to 12 different ports in one cycle. +def ADLPPort00 : ProcResource<1>; +def ADLPPort01 : ProcResource<1>; +def ADLPPort02 : ProcResource<1>; +def ADLPPort03 : ProcResource<1>; +def ADLPPort04 : ProcResource<1>; +def ADLPPort05 : ProcResource<1>; +def ADLPPort06 : ProcResource<1>; +def ADLPPort07 : ProcResource<1>; +def ADLPPort08 : ProcResource<1>; +def ADLPPort09 : ProcResource<1>; +def ADLPPort10 : ProcResource<1>; +def ADLPPort11 : ProcResource<1>; + +// Workaround to represent invalid ports. WriteRes shouldn't use this resource. +def ADLPPortInvalid : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def ADLPPort00_01 : ProcResGroup<[ADLPPort00, ADLPPort01]>; +def ADLPPort00_01_05 : ProcResGroup<[ADLPPort00, ADLPPort01, ADLPPort05]>; +def ADLPPort00_01_05_06 : ProcResGroup<[ADLPPort00, ADLPPort01, ADLPPort05, ADLPPort06]>; +def ADLPPort00_05 : ProcResGroup<[ADLPPort00, ADLPPort05]>; +def ADLPPort00_05_06 : ProcResGroup<[ADLPPort00, ADLPPort05, ADLPPort06]>; +def ADLPPort00_06 : ProcResGroup<[ADLPPort00, ADLPPort06]>; +def ADLPPort01_05 : ProcResGroup<[ADLPPort01, ADLPPort05]>; +def ADLPPort01_05_10 : ProcResGroup<[ADLPPort01, ADLPPort05, ADLPPort10]>; +def ADLPPort02_03 : ProcResGroup<[ADLPPort02, ADLPPort03]>; +def ADLPPort02_03_11 : ProcResGroup<[ADLPPort02, ADLPPort03, ADLPPort11]>; +def ADLPPort07_08 : ProcResGroup<[ADLPPort07, ADLPPort08]>; + +// EU has 112 reservation stations. 
+def ADLPPort00_01_05_06_10 : ProcResGroup<[ADLPPort00, ADLPPort01, ADLPPort05, + ADLPPort06, ADLPPort10]> { + let BufferSize = 112; +} + +// STD has 48 reservation stations. +def ADLPPort04_09 : ProcResGroup<[ADLPPort04, ADLPPort09]> { + let BufferSize = 48; +} + +// MEM has 72 reservation stations. +def ADLPPort02_03_07_08_11 : ProcResGroup<[ADLPPort02, ADLPPort03, ADLPPort07, + ADLPPort08, ADLPPort11]> { + let BufferSize = 72; +} + +// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available +// until 5 cycles after the memory operand. +def : ReadAdvance; + +// Vector loads are 6 cycles, so ReadAfterVec*Ld registers needn't be available +// until 6 cycles after the memory operand. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +def : ReadAdvance; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass ADLPWriteResPair ExePorts, + int Lat, list Res = [1], int UOps = 1, + int LoadLat = 5, int LoadUOps = 1> { + // Register variant is using a single cycle on ExePort. + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3/11 and adds LoadLat cycles to + // the latency (default = 5). 
+ def : WriteRes { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, LoadUOps); + } +} diff --git a/llvm/utils/schedtool/lib/template/sapphirerapids.td b/llvm/utils/schedtool/lib/template/sapphirerapids.td new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/template/sapphirerapids.td @@ -0,0 +1,116 @@ +//=- X86SchedSapphireRapids.td - X86 SapphireRapids Scheduling *- tablegen -*=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for SapphireRapids to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SapphireRapidsModel : SchedMachineModel { + // SapphireRapids can allocate 6 uops per cycle. + let IssueWidth = 6; // Based on allocator width. + let MicroOpBufferSize = 512; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Latency for microcoded instructions or instructions without latency info. + int MaxLatency = 100; + + // Based on the LSD (loop-stream detector) queue size (ST). + let LoopMicroOpBufferSize = 72; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SapphireRapidsModel in { + +// SapphireRapids can issue micro-ops to 12 different ports in one cycle. 
+def SPRPort00 : ProcResource<1>; +def SPRPort01 : ProcResource<1>; +def SPRPort02 : ProcResource<1>; +def SPRPort03 : ProcResource<1>; +def SPRPort04 : ProcResource<1>; +def SPRPort05 : ProcResource<1>; +def SPRPort06 : ProcResource<1>; +def SPRPort07 : ProcResource<1>; +def SPRPort08 : ProcResource<1>; +def SPRPort09 : ProcResource<1>; +def SPRPort10 : ProcResource<1>; +def SPRPort11 : ProcResource<1>; + +// Workaround to represent invalid ports. WriteRes shouldn't use this resource. +def SPRPortInvalid :ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SPRPort00_01 : ProcResGroup<[SPRPort00, SPRPort01]>; +def SPRPort00_01_05 : ProcResGroup<[SPRPort00, SPRPort01, SPRPort05]>; +def SPRPort00_01_05_06 : ProcResGroup<[SPRPort00, SPRPort01, SPRPort05, SPRPort06]>; +def SPRPort00_05 : ProcResGroup<[SPRPort00, SPRPort05]>; +def SPRPort00_05_06 : ProcResGroup<[SPRPort00, SPRPort05, SPRPort06]>; +def SPRPort00_06 : ProcResGroup<[SPRPort00, SPRPort06]>; +def SPRPort01_05 : ProcResGroup<[SPRPort01, SPRPort05]>; +def SPRPort01_05_10 : ProcResGroup<[SPRPort01, SPRPort05, SPRPort10]>; +def SPRPort02_03 : ProcResGroup<[SPRPort02, SPRPort03]>; +def SPRPort02_03_11 : ProcResGroup<[SPRPort02, SPRPort03, SPRPort11]>; +def SPRPort07_08 : ProcResGroup<[SPRPort07, SPRPort08]>; + +// EU has 112 reservation stations. +def SPRPort00_01_05_06_10 : ProcResGroup<[SPRPort00, SPRPort01, SPRPort05, + SPRPort06, SPRPort10]> { + let BufferSize = 112; +} + +// STD has 48 reservation stations. +def SPRPort04_09 : ProcResGroup<[SPRPort04, SPRPort09]> { + let BufferSize = 48; +} + +// MEM has 72 reservation stations. +def SPRPort02_03_07_08_11 : ProcResGroup<[SPRPort02, SPRPort03, SPRPort07, + SPRPort08, SPRPort11]> { + let BufferSize = 72; +} + +// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available +// until 5 cycles after the memory operand. 
+def : ReadAdvance; + +// Vector loads are 6 cycles, so ReadAfterVec*Ld registers needn't be available +// until 6 cycles after the memory operand. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +def : ReadAdvance; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SPRWriteResPair ExePorts, + int Lat, list Res = [1], int UOps = 1, + int LoadLat = 5, int LoadUOps = 1> { + // Register variant is using a single cycle on ExePort. + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3/11 and adds LoadLat cycles to + // the latency (default = 5). + def : WriteRes { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, LoadUOps); + } +} diff --git a/llvm/utils/schedtool/lib/utils.py b/llvm/utils/schedtool/lib/utils.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/lib/utils.py @@ -0,0 +1,277 @@ +import collections, os, re, unittest + + +def to_int(obj, base=10): + if isinstance(obj, int): + return obj + try: + return int(obj, base) + except: + return None + + +def nums2str(nums, width, sep='', prefix=''): + formated_nums = [str(int(num)).zfill(width) for num in nums] + return prefix + sep.join(formated_nums) + + +def str2nums(string, sep='', prefix=''): + formated_nums = string.lstrip(prefix).split(sep) + return [int(num) for num in formated_nums] + + +def listremove(a, b): + ''' Return "a - b". 
'''
+    ac = list(a)
+    for i in b:
+        ac.remove(i)
+    return type(a)(ac)
+
+
+def listdiff(a, b):
+    check = True
+    ac, bc = list(a), list(b)
+    while check:
+        check = False
+        for x in ac:
+            if x in bc:
+                ac.remove(x)
+                bc.remove(x)
+                check = True
+                break
+    return tuple(ac + bc)
+
+
+def listcontain(a, b):
+    '''
+    Return true if b is a subset of a.
+    '''
+    ac = list(a)
+    for x in b:
+        if x in ac:
+            ac.remove(x)
+        else:
+            return False
+    return True
+
+
+def lt_none(a, b):
+    if (a is None and b is None) or a is None:
+        return True
+    elif b is None:
+        return False
+    else:
+        return a < b
+
+
+def cmplist(a, b):
+    return collections.Counter(a) == collections.Counter(b)
+
+
+def commonpostfix(strings):
+    inversed_strings = [string[::-1] for string in strings.copy()]
+    inversed_common_postfix = os.path.commonprefix(inversed_strings)
+    return inversed_common_postfix[::-1]
+
+
+class RegexReducer:
+    ''' Reduce a list of regexes to more concise regexes. '''
+    def __init__(self, diff_len_limit=2):
+
+        # Determine what len of non-number diff are allowed.
+        # 0 means only digits diff are allowed.
+        # 1 means only 1 non-digits diff are allowed.
+        self.diff_len_limit = diff_len_limit
+
+    def __is_all_digits(self, diff1, diff2):
+        return diff1.isdigit() and diff2.isdigit()
+
+    def __is_under_limit(self, diff1, diff2):
+        return max(len(diff1), len(diff2)) <= self.diff_len_limit
+
+    def __is_in_regex(self, string, begin, end):
+        probe = 0
+        for i in range(end):
+            if string[i] == '(':
+                probe += 1
+            elif string[i] == ')':
+                probe -= 1
+
+            if i >= begin and i < end and (probe != 0 or string[i] == '?'):
+                return True
+        return False
+
+    def reduce_once(self, regexes_in):
+        assert isinstance(regexes_in, list), 'list type is required'
+
+        changed = False
+        worklist, regexes_out = regexes_in.copy()[::-1], []
+        while worklist:
+            specimen = worklist.pop()
+            common_prefix, common_postfix, members = None, None, []
+
+            # Ascending priority. If diff are pure numbers, allow it. 
If not,
+            # check if diff len is under limit.
+            checker_list = [self.__is_under_limit, self.__is_all_digits]
+
+            # Step1: find a pair of common_prefix/postfix meets requirement.
+            while (checker_list
+                   and (common_prefix, common_postfix) == (None, None)):
+                checker = checker_list.pop()
+                for string in worklist:
+                    cprefix = os.path.commonprefix([string, specimen])
+
+                    # get postfix for the rest part.
+                    cpostfix = commonpostfix(
+                        [string[len(cprefix):], specimen[len(cprefix):]])
+
+                    # [begin, end) index of diff1/diff2.
+                    begin1, end1 = len(cprefix), len(specimen) - len(cpostfix)
+                    begin2, end2 = len(cprefix), len(string) - len(cpostfix)
+
+                    diff1, diff2 = specimen[begin1:end1], string[begin2:end2]
+
+                    if ((not self.__is_in_regex(specimen, begin1, end1))
+                            and (not self.__is_in_regex(string, begin2, end2))
+                            and checker(diff1, diff2)):
+                        common_prefix, common_postfix = cprefix, cpostfix
+                        changed = True
+                        break
+
+            # Step2: find members of this common_prefix/postfix.
+            if (common_prefix, common_postfix) != (None, None):
+                for string in worklist:
+                    cprefix = os.path.commonprefix([string, specimen])
+                    cpostfix = commonpostfix(
+                        [string[len(cprefix):], specimen[len(cprefix):]])
+                    begin1, end1 = len(cprefix), len(specimen) - len(cpostfix)
+                    begin2, end2 = len(cprefix), len(string) - len(cpostfix)
+                    diff1, diff2 = specimen[begin1:end1], string[begin2:end2]
+
+                    if ((common_prefix, common_postfix) == (cprefix, cpostfix)
+                            and checker(diff1, diff2)):
+                        members.append(string)
+
+            # Step3: remove members from worklist.
+            for member in members:
+                worklist.remove(member)
+            members.append(specimen)
+
+            # Step4: gen regex for members. 
+            if len(members) == 1:
+                regex = members[0]
+            else:
+                diffs = [
+                    x[len(common_prefix):len(x) - len(common_postfix)]
+                    for x in members
+                ]
+                need_question_mark = False
+                if '' in diffs:
+                    need_question_mark = True
+                    diffs = [x for x in diffs if x != '']
+                if diffs:
+                    diffs.sort(key=lambda x: (len(x), x))
+                    regex = '|'.join(diffs)
+                    if len(diffs) > 1 or len(diffs[0]) > 1:
+                        regex = '(' + regex + ')'
+                    if need_question_mark:
+                        regex = f'({regex}?)'
+                regex = common_prefix + regex + common_postfix
+
+            regexes_out.append(regex)
+
+        if changed:
+            assert (set(regexes_out) != set(regexes_in))
+        return (regexes_out, changed)
+
+    def reduce(self, regexes_in):
+        changed = True
+        last_regexes = regexes_in
+
+        # Workaround to prefer short prefix.
+        real_limit = self.diff_len_limit
+        if real_limit > 2:
+            self.diff_len_limit = 2
+            while changed:
+                last_regexes, changed = self.reduce_once(last_regexes)
+            self.diff_len_limit = real_limit
+            changed = True
+
+        while changed:
+            last_regexes, changed = self.reduce_once(last_regexes)
+
+        # Validation. 
+ for regex_in in regexes_in: + hit = 0 + for regex_out in last_regexes: + if re.match(f'^{regex_out}$', regex_in): + hit += 1 + assert hit == 1, f'{regex_in}, {regexes_in}' + + return last_regexes + + +if __name__ == '__main__': + + class UtilsChecker(unittest.TestCase): + def test_to_int(self): + self.assertEqual(to_int('1a'), None) + self.assertEqual(to_int('0x12', 16), 0x12) + self.assertEqual(to_int(4), 4) + self.assertEqual(to_int('12'), 12) + + def test_nums2str(self): + self.assertEqual(nums2str([1, 2, 3], 2, sep='_', prefix='GRTPort'), + 'GRTPort01_02_03') + + def test_str2nums(self): + self.assertEqual( + str2nums('GRTPort01_02_03', sep='_', prefix='GRTPort'), + [1, 2, 3]) + + def test_listdiff(self): + self.assertEqual(listdiff([1, 1, 2], [3, 2, 4, 2, 1]), + (1, 3, 4, 2)) + + def test_listremove(self): + self.assertEqual(listremove([1, 1, 2], [1]), [1, 2]) + + def test_listcontain(self): + self.assertTrue(listcontain([1, 1, 2], [1])) + self.assertFalse(listcontain([1, 1, 2], [3])) + + def test_regex_reducer(self): + self.assertEqual( + RegexReducer().reduce([ + 'ABS8ri8', 'ABS16ri8', 'ABS8mr', 'ABS32ri16', 'ABS32ri32', + 'ABS8x', 'ABS8f', 'ABS8i', 'ABS8', 'aes' + ]), + ['ABS(8|16)ri8', 'ABS8((f|i|x|mr)?)', 'ABS32ri(16|32)', 'aes']) + self.assertEqual( + RegexReducer(1).reduce([ + 'ABS8ri8', 'ABS16ri8', 'ABS8mr', 'ABS32ri16', 'ABS32ri32', + 'ABS8x', 'ABS8f', 'ABS8i', 'ABS8', 'aes' + ]), [ + 'ABS(8|16)ri8', 'ABS8mr', 'ABS32ri(16|32)', + 'ABS8((f|i|x)?)', 'aes' + ]) + self.assertEqual( + RegexReducer(0).reduce([ + 'ABS8ri8', 'ABS16ri8', 'ABS8mr', 'ABS32ri16', 'ABS32ri32', + 'ABS8x', 'ABS8f', 'ABS8i', 'ABS8', 'aes' + ]), [ + 'ABS(8|16)ri8', 'ABS8mr', 'ABS32ri(16|32)', 'ABS8x', + 'ABS8f', 'ABS8i', 'ABS8', 'aes' + ]) + self.assertEqual( + RegexReducer().reduce_once( + ['(V?)CVTTSS2SI64rr_Int', '(V?)CVTSS2SI64rr_Int'])[0], + ['(V?)CVT(T?)SS2SI64rr_Int']) + self.assertEqual( + RegexReducer(4).reduce([ + 'CVTSD2SIrm', 'CVTSD2SIrm_Int', 'VCVTSD2SIrm', + 
'VCVTSD2SIrm_Int', 'CVTTSD2SIrm', 'CVTTSD2SIrm_Int', + 'VCVTTSD2SIrm_Int', 'VCVTTSD2SIrm' + ]), ['(V?)CVT(T?)SD2SIrm((_Int)?)']) + + unittest.main() diff --git a/llvm/utils/schedtool/llvm-patch/0001-Add-llvm-smv-tool-to-auto-generate-instruction-sched.patch b/llvm/utils/schedtool/llvm-patch/0001-Add-llvm-smv-tool-to-auto-generate-instruction-sched.patch new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/llvm-patch/0001-Add-llvm-smv-tool-to-auto-generate-instruction-sched.patch @@ -0,0 +1,215 @@ +From 41c5bef4f0a91d5cba759ce2339823fd0330016f Mon Sep 17 00:00:00 2001 +From: Haohai Wen +Date: Thu, 3 Jun 2021 13:03:46 +0800 +Subject: [PATCH 1/4] Add llvm-smv tool to auto generate instruction scheduler + info. + +Change-Id: Idb4e392883cdbb99a5b4a6a12b559e970f88b000 +--- + llvm/tools/llvm-smv/CMakeLists.txt | 16 +++ + llvm/tools/llvm-smv/llvm-smv.cpp | 170 +++++++++++++++++++++++++++++ + 2 files changed, 186 insertions(+) + create mode 100644 llvm/tools/llvm-smv/CMakeLists.txt + create mode 100644 llvm/tools/llvm-smv/llvm-smv.cpp + +diff --git a/llvm/tools/llvm-smv/CMakeLists.txt b/llvm/tools/llvm-smv/CMakeLists.txt +new file mode 100644 +index 000000000000..b869f5d4dc55 +--- /dev/null ++++ b/llvm/tools/llvm-smv/CMakeLists.txt +@@ -0,0 +1,16 @@ ++include_directories(include) ++ ++set(LLVM_LINK_COMPONENTS ++ AllTargetsDescs ++ AllTargetsInfos ++ MCA ++ MC ++ MCParser ++ Support ++ ) ++ ++add_llvm_tool(llvm-smv ++ llvm-smv.cpp ++ ) ++ ++set(LLVM_SMV_SOURCE_DIR ${CURRENT_SOURCE_DIR}) +diff --git a/llvm/tools/llvm-smv/llvm-smv.cpp b/llvm/tools/llvm-smv/llvm-smv.cpp +new file mode 100644 +index 000000000000..d6560ff276fc +--- /dev/null ++++ b/llvm/tools/llvm-smv/llvm-smv.cpp +@@ -0,0 +1,170 @@ ++#include "llvm/MC/MCRegisterInfo.h" ++#include "llvm/MC/MCSubtargetInfo.h" ++#include "llvm/MC/TargetRegistry.h" ++#include "llvm/MCA/InstrBuilder.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/ErrorHandling.h" ++#include "llvm/Support/ErrorOr.h" 
++#include "llvm/Support/FileSystem.h" ++#include "llvm/Support/Host.h" ++#include "llvm/Support/InitLLVM.h" ++#include "llvm/Support/JSON.h" ++#include "llvm/Support/TargetSelect.h" ++#include "llvm/Support/ToolOutputFile.h" ++#include "llvm/Support/WithColor.h" ++ ++using namespace llvm; ++ ++static cl::OptionCategory ToolOptions("Tool Options"); ++static cl::opt ++ ArchName("march", ++ cl::desc("Target architecture. " ++ "See -version for available targets"), ++ cl::cat(ToolOptions)); ++ ++static cl::opt ++ TripleName("mtriple", ++ cl::desc("Target triple. See -version for available targets"), ++ cl::cat(ToolOptions)); ++ ++static cl::opt ++ MCPU("mcpu", ++ cl::desc("Target a specific cpu type (-mcpu=help for details)"), ++ cl::value_desc("cpu-name"), cl::cat(ToolOptions), cl::init("native")); ++ ++static cl::opt MATTR("mattr", ++ cl::desc("Additional target features."), ++ cl::cat(ToolOptions)); ++ ++static cl::opt OutputFilename("o", cl::desc("Output filename"), ++ cl::init("-"), cl::cat(ToolOptions), ++ cl::value_desc("filename")); ++ ++namespace { ++ ++const Target *getTarget(const char *ProgName) { ++ if (TripleName.empty()) ++ TripleName = Triple::normalize(sys::getDefaultTargetTriple()); ++ Triple TheTriple(TripleName); ++ ++ // Get the target specific parser. ++ std::string Error; ++ const Target *TheTarget = ++ TargetRegistry::lookupTarget(ArchName, TheTriple, Error); ++ if (!TheTarget) { ++ errs() << ProgName << ": " << Error; ++ return nullptr; ++ } ++ ++ // Update TripleName with the updated triple from the target lookup. ++ TripleName = TheTriple.str(); ++ ++ // Return the found target. 
++ return TheTarget; ++} ++ ++ErrorOr> getOutputStream() { ++ if (OutputFilename == "") ++ OutputFilename = "-"; ++ std::error_code EC; ++ auto Out = std::make_unique(OutputFilename, EC, ++ sys::fs::OF_TextWithCRLF); ++ if (!EC) ++ return std::move(Out); ++ return EC; ++} ++} // end of anonymous namespace ++ ++int main(int argc, char **argv) { ++ InitLLVM X(argc, argv); ++ ++ // Initialize targets. ++ InitializeAllTargetInfos(); ++ InitializeAllTargetMCs(); ++ ++ // Enable printing of available targets when flag --version is specified. ++ cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); ++ ++ // Parse flags and initialize target options. ++ cl::ParseCommandLineOptions(argc, argv, ++ "llvm machine code performance analyzer.\n"); ++ ++ // Get the target from the triple. If a triple is not specified, then select ++ // the default triple for the host. If the triple doesn't correspond to any ++ // registered target, then exit with an error message. ++ const char *ProgName = argv[0]; ++ const Target *TheTarget = getTarget(ProgName); ++ if (!TheTarget) ++ return 1; ++ ++ // GetTarget() may replaced TripleName with a default triple. ++ // For safety, reconstruct the Triple object. ++ Triple TheTriple(TripleName); ++ ++ if (MCPU == "native") ++ MCPU = std::string(llvm::sys::getHostCPUName()); ++ ++ // Now initialize the output file. 
++ auto OF = getOutputStream(); ++ if (std::error_code EC = OF.getError()) { ++ WithColor::error() << EC.message() << '\n'; ++ return 1; ++ } ++ std::unique_ptr TOF = std::move(*OF); ++ auto &OS = TOF->os(); ++ ++ std::unique_ptr STI( ++ TheTarget->createMCSubtargetInfo(TripleName, MCPU, MATTR)); ++ assert(STI && "Unable to create subtarget info!"); ++ if (!STI->isCPUStringValid(MCPU)) ++ return 1; ++ ++ std::unique_ptr MCII(TheTarget->createMCInstrInfo()); ++ assert(MCII && "Unable to create instruction info!"); ++ ++ const MCSchedModel &SM = STI->getSchedModel(); ++ json::Object JRoot; ++ ++ for (unsigned I = 1, E = MCII->getNumOpcodes(); I < E; ++I) { ++ const MCInstrDesc &MCID = MCII->get(I); ++ ++ // Then obtain the scheduling class information from the instruction. ++ unsigned SchedClassID = MCID.getSchedClass(); ++ bool IsVariant = SM.getSchedClassDesc(SchedClassID)->isVariant(); ++ ++ // Skip checking variant schedclass. ++ if (IsVariant) { ++ errs() << "unable to resolve scheduling class for write variant: " ++ << MCII->getName(I) << "\n"; ++ continue; ++ } ++ ++ // Skip checking unsupported instructions. 
++ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); ++ if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) { ++ errs() << "found an unsupported instruction: " << MCII->getName(I) ++ << "\n"; ++ continue; ++ } ++ ++ json::Object JInstInfo; ++ JInstInfo["NumUops"] = SCDesc.NumMicroOps; ++ JInstInfo["Latency"] = MCSchedModel::computeInstrLatency(*STI, SCDesc); ++ JInstInfo["RThroughput"] = ++ MCSchedModel::getReciprocalThroughput(*STI, SCDesc); ++ ++ json::Object JWriteProcRes; ++ const MCWriteProcResEntry *Idx = STI->getWriteProcResBegin(&SCDesc); ++ const MCWriteProcResEntry *End = STI->getWriteProcResEnd(&SCDesc); ++ for (; Idx != End; ++Idx) { ++ StringRef Name = SM.getProcResource(Idx->ProcResourceIdx)->Name; ++ JWriteProcRes[Name] = Idx->Cycles; ++ } ++ JInstInfo["WriteRes"] = std::move(JWriteProcRes); ++ JRoot[MCII->getName(I)] = std::move(JInstInfo); ++ } ++ ++ OS << formatv("{0:2}", json::Value(std::move(JRoot))); ++ TOF->keep(); ++ return 0; ++} +-- +2.18.1 + diff --git a/llvm/utils/schedtool/llvm-patch/0002-Support-debug-only-print-opcode-to-llvm-mc.patch b/llvm/utils/schedtool/llvm-patch/0002-Support-debug-only-print-opcode-to-llvm-mc.patch new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/llvm-patch/0002-Support-debug-only-print-opcode-to-llvm-mc.patch @@ -0,0 +1,66 @@ +From 4828401cf5c76198577d57afa24459d231a08662 Mon Sep 17 00:00:00 2001 +From: Haohai Wen +Date: Wed, 16 Mar 2022 15:23:14 +0800 +Subject: [PATCH 2/4] Support --debug-only=print-opcode to llvm-mc + +This option will emit all possbile matched opcodes. 
+--- + llvm/utils/TableGen/AsmMatcherEmitter.cpp | 25 ++++++++++++++++++++++- + 1 file changed, 24 insertions(+), 1 deletion(-) + +diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp +index 8f3c98b4303f..0b7d784844e2 100644 +--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp ++++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp +@@ -3547,7 +3547,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + // Finally, build the match function. + OS << "unsigned " << Target.getName() << ClassName << "::\n" + << "MatchInstructionImpl(const OperandVector &Operands,\n"; +- OS << " MCInst &Inst,\n"; ++ OS << " MCInst &GenuineInst,\n"; + if (ReportMultipleNearMisses) + OS << " SmallVectorImpl *NearMisses,\n"; + else +@@ -3634,9 +3634,16 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + OS << " if (MnemonicRange.first == MnemonicRange.second)\n"; + OS << " return Match_MnemonicFail;\n\n"; + ++ OS << " bool PrintOpcodes = false;\n"; ++ OS << " DEBUG_WITH_TYPE(\"print-opcode\", PrintOpcodes = true);\n"; ++ OS << " bool MatchOnce = false;\n"; ++ OS << " MCInst FakeInst(GenuineInst);\n"; ++ OS << " std::string MatchedOpcodes;\n"; + OS << " for (const MatchEntry *it = MnemonicRange.first, " + << "*ie = MnemonicRange.second;\n"; + OS << " it != ie; ++it) {\n"; ++ OS << " MCInst *RealInst = MatchOnce ? 
&FakeInst : &GenuineInst;\n"; ++ OS << " MCInst &Inst = *RealInst;\n"; + OS << " const FeatureBitset &RequiredFeatures = " + "FeatureBitsets[it->RequiredFeaturesIdx];\n"; + OS << " bool HasRequiredFeatures =\n"; +@@ -3964,6 +3971,22 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + OS << " DEBUG_WITH_TYPE(\n"; + OS << " \"asm-matcher\",\n"; + OS << " dbgs() << \"Opcode result: complete match, selecting this opcode\\n\");\n"; ++ OS << "\n"; ++ OS << " if (!MatchOnce) {\n"; ++ OS << " MatchOnce = true;\n"; ++ OS << " if (!PrintOpcodes)\n"; ++ OS << " return Match_Success;\n"; ++ OS << " MatchedOpcodes = MII.getName(it->Opcode).str();\n"; ++ OS << " } else {\n"; ++ OS << " FakeInst.clear();\n"; ++ OS << " MatchedOpcodes += std::string(\",\") + MII.getName(it->Opcode).str();\n"; ++ OS << " }\n"; ++ // OS << " return Match_Success;\n"; ++ OS << " }\n\n"; ++ ++ OS << " if (MatchOnce) {\n"; ++ OS << " if (PrintOpcodes)\n"; ++ OS << " outs() << MatchedOpcodes;\n"; + OS << " return Match_Success;\n"; + OS << " }\n\n"; + +-- +2.18.1 + diff --git a/llvm/utils/schedtool/llvm-patch/0003-Add-gen-x86-inst-sched-info-to-emit-x86-instruction-.patch b/llvm/utils/schedtool/llvm-patch/0003-Add-gen-x86-inst-sched-info-to-emit-x86-instruction-.patch new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/llvm-patch/0003-Add-gen-x86-inst-sched-info-to-emit-x86-instruction-.patch @@ -0,0 +1,505 @@ +From 775767b265c0376b43b960ab5465cb663bc33736 Mon Sep 17 00:00:00 2001 +From: Haohai Wen +Date: Wed, 16 Mar 2022 14:45:36 +0800 +Subject: [PATCH 3/4] Add --gen-x86-inst-sched-info to emit x86 instruction + sched info + +--- + llvm/utils/TableGen/AsmMatcherEmitter.cpp | 416 +++++++++++++++++++++- + 1 file changed, 411 insertions(+), 5 deletions(-) + +diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp +index 0b7d784844e2..7c2737fdf4b9 100644 +--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp ++++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp +@@ 
-110,11 +110,13 @@ + #include "llvm/Support/CommandLine.h" + #include "llvm/Support/Debug.h" + #include "llvm/Support/ErrorHandling.h" ++#include "llvm/Support/JSON.h" + #include "llvm/TableGen/Error.h" + #include "llvm/TableGen/Record.h" + #include "llvm/TableGen/StringMatcher.h" + #include "llvm/TableGen/StringToOffsetTable.h" + #include "llvm/TableGen/TableGenBackend.h" ++#include "X86RecognizableInstr.h" + #include + #include + #include +@@ -750,6 +752,13 @@ public: + /// Map of RegisterClass records to their class information. + std::map RegisterClassClasses; + ++ // All instructions as InstAlias result. ++ using AliasResultsSetTy = DenseSet; ++ std::unique_ptr AliasResultsSet; ++ ++ // All CodeGenOnly but matchable instructions. ++ SmallVector CodeGenOnlyInstrs; ++ + private: + /// Map of token to class information which has already been constructed. + std::map TokenClasses; +@@ -782,7 +791,7 @@ public: + RecordKeeper &Records); + + /// Construct the various tables used during matching. +- void buildInfo(); ++ void buildInfo(bool AllowX86NoPseudo=false); + + /// buildOperandMatchInfo - Build the necessary information to handle user + /// defined operand parsing methods. +@@ -1480,7 +1489,7 @@ void AsmMatcherInfo::buildOperandMatchInfo() { + } + } + +-void AsmMatcherInfo::buildInfo() { ++void AsmMatcherInfo::buildInfo(bool AllowX86NoPseudo) { + // Build information about all of the AssemblerPredicates. + const std::vector> + &SubtargetFeaturePairs = SubtargetFeatureInfo::getAll(Records); +@@ -1495,6 +1504,13 @@ void AsmMatcherInfo::buildInfo() { + bool ReportMultipleNearMisses = + AsmParser->getValueAsBit("ReportMultipleNearMisses"); + ++ // Collect all instruction alias results. 
++ if (AllowX86NoPseudo && !AliasResultsSet) { ++ AliasResultsSet = std::make_unique(); ++ for (Record *InstAlias : Records.getAllDerivedDefinitions("InstAlias")) ++ AliasResultsSet->insert(CodeGenInstAlias(InstAlias, Target).ResultInst); ++ } ++ + // Parse the instructions; we need to do this first so that we can gather the + // singleton register classes. + SmallPtrSet SingletonRegisters; +@@ -1522,8 +1538,33 @@ void AsmMatcherInfo::buildInfo() { + continue; + + // Ignore "codegen only" instructions. +- if (CGI->TheDef->getValueAsBit("isCodeGenOnly")) +- continue; ++ // Collect X86 CodeGenOnly but not pseudo instructions. ++ if (CGI->TheDef->getValueAsBit("isCodeGenOnly")) { ++ if (!AllowX86NoPseudo || CGI->isPseudo) ++ continue; ++ ++ if (!CommentDelimiter.empty() && ++ StringRef(CGI->AsmString).contains(CommentDelimiter)) ++ continue; ++ ++ BitsInit *FormBits = CGI->TheDef->getValueAsBitsInit("FormBits"); ++ uint8_t Width = FormBits->getNumBits(); ++ uint8_t Form = 0, Mask = 1; ++ for (unsigned Index = 0; Index < Width; Index++) { ++ if (cast(FormBits->getBit(Index))->getValue()) ++ Form |= Mask; ++ Mask <<= 1; ++ } ++ ++ // Skip X86 pseudo and instructions as alias result. ++ if (Form == X86Local::Pseudo || AliasResultsSet->count(CGI)) ++ continue; ++ ++ if (CGI->TheDef->getValueAsBit("hasLockPrefix")) ++ continue; ++ ++ CodeGenOnlyInstrs.push_back(CGI); ++ } + + // Ignore instructions for different instructions + StringRef V = CGI->TheDef->getValueAsString("AsmVariantName"); +@@ -3207,7 +3248,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + + // Compute the information on the instructions to match. + AsmMatcherInfo Info(AsmParser, Target, Records); +- Info.buildInfo(); ++ Info.buildInfo(/*AllowX86NoPseudo=*/true); + + // Sort the instruction table using the partial order on classes. 
We use + // stable_sort to ensure that ambiguous instructions are still +@@ -3503,6 +3544,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + + OS << "} // end anonymous namespace\n\n"; + ++ OS << "static const DenseSet CodeGenOnlySet = {\n"; ++ for (const CodeGenInstruction *CGI : Info.CodeGenOnlyInstrs) ++ OS << " " << Target.getInstNamespace() << "::" << CGI->TheDef->getName() ++ << ",\n"; ++ OS << "};\n\n"; ++ + unsigned VariantCount = Target.getAsmParserVariantCount(); + for (unsigned VC = 0; VC != VariantCount; ++VC) { + Record *AsmVariant = Target.getAsmParserVariant(VC); +@@ -3660,6 +3707,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + OS << " bool MultipleInvalidOperands = false;\n"; + } + ++ OS << " if (!PrintOpcodes && CodeGenOnlySet.count(it->Opcode)) {\n"; ++ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Skip CodeGenOnly\\n\");\n"; ++ OS << " continue;\n"; ++ OS << " }\n\n"; ++ + if (HasMnemonicFirst) { + OS << " // equal_range guarantees that instruction mnemonic matches.\n"; + OS << " assert(Mnemonic == it->getMnemonic());\n"; +@@ -4027,3 +4079,357 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { + + static TableGen::Emitter::OptClass + X("gen-asm-matcher", "Generate assembly instruction matcher"); ++ ++//===---------- Customization to emit x86 inst sched info ----------------===// ++ ++// Global index to avoid using same regs on an instruction. 
++static int RegIdx = 0; ++ ++static std::string genRegisterEnum(CodeGenRegBank &RegBank, ++ const CGIOperandList::OperandInfo *OI) { ++ CodeGenRegisterClass *RC = nullptr; ++ StringRef PrintMethod = OI->PrinterMethodName; ++ if (OI->Rec->isSubClassOf("RegisterClass")) ++ RC = RegBank.getRegClass(OI->Rec); ++ else if (OI->Rec->isSubClassOf("RegisterOperand")) ++ RC = RegBank.getRegClass(OI->Rec->getValueAsDef("RegClass")); ++ else ++ llvm_unreachable("Unknown register class"); ++ ++ ArrayRef Members = RC->getMembers(); ++ const CodeGenRegister *Reg = Members[RegIdx++ % Members.size()]; ++ StringRef RegName = Reg->getName(); ++ std::string Prefix = "%"; ++ if (PrintMethod == "printOperand") { ++ return Prefix + RegName.lower(); ++ } else if (PrintMethod == "printSTiRegOperand") { ++ return Prefix + RegName.lower().substr(0, 2) + '(' + ++ RegName.lower().substr(2) + ')'; ++ } else if (PrintMethod == "printVKPair") { ++ if (RegName.compare_insensitive("k0_k1") == 0) ++ return Prefix + "k0"; ++ else if (RegName.compare_insensitive("k2_k3") == 0) ++ return Prefix + "k2"; ++ else if (RegName.compare_insensitive("k4_k5") == 0) ++ return Prefix + "k4"; ++ else if (RegName.compare_insensitive("k6_k7") == 0) ++ return Prefix + "k6"; ++ else ++ llvm_unreachable("Unknown k register"); ++ } else { ++ std::string ErrMsg = ++ std::string("Unknown print method: ") + PrintMethod.str(); ++ llvm_unreachable(ErrMsg.c_str()); ++ } ++} ++ ++static std::string genImmediateEnum(const CGIOperandList::OperandInfo *OI) { ++ unsigned ImmSize = OI->Rec->getValueAsDef("Type")->getValueAsInt("Size"); ++ StringRef ValueName = OI->Rec->getName(); ++ ++ // Rewrite immsize for some operand type. 
++ if (ValueName == "i64u8imm" || ValueName == "i64i8imm" || ++ ValueName == "i32u8imm" || ValueName == "i32i8imm" || ++ ValueName == "i16u8imm" || ValueName == "i16i8imm") ++ ImmSize = 8; ++ else if (ValueName == "i64i32imm") ++ ImmSize = 32; ++ ++ switch (ImmSize) { ++ case 1: ++ return "$0x1"; ++ case 8: ++ return "$0x01"; ++ case 16: ++ return "$0x1234"; ++ case 32: ++ return "$0x12345678"; ++ case 64: ++ return "$0x123456789abcdef0"; ++ default: ++ llvm_unreachable("Unknown ImmSize"); ++ } ++} ++ ++static std::string genMemoryEnum(const CGIOperandList::OperandInfo *OI) { ++ StringRef ValueName = OI->Rec->getName(); ++ StringRef PrintMethod = OI->PrinterMethodName; ++ if (PrintMethod == "printbytemem") ++ return "(%esp)"; ++ else if (PrintMethod == "printwordmem") ++ return "(%esp)"; ++ else if (PrintMethod == "printdwordmem") ++ return "(%esp)"; ++ else if (PrintMethod == "printqwordmem") { ++ if (ValueName == "vx64mem" || ValueName == "vx64xmem") ++ return "(%esp, %xmm15, 2)"; ++ return "(%esp)"; ++ } else if (PrintMethod == "printxmmwordmem") { ++ if (ValueName == "vx128mem" || ValueName == "vx128xmem") ++ return "(%esp, %xmm15, 2)"; ++ if (ValueName == "vy128mem" || ValueName == "vy128xmem") ++ return "(%esp, %ymm15, 2)"; ++ return "(%esp)"; ++ } else if (PrintMethod == "printymmwordmem") { ++ if (ValueName == "vy256mem" || ValueName == "vy256xmem") ++ return "(%esp, %ymm15, 2)"; ++ if (ValueName == "vx256mem" || ValueName == "vx256xmem") ++ return "(%esp, %xmm15, 2)"; ++ if (ValueName == "vz256mem") ++ return "(%esp, %zmm15, 2)"; ++ return "(%esp)"; ++ } else if (PrintMethod == "printzmmwordmem") { ++ if (ValueName == "vy512xmem") ++ return "(%esp, %ymm15, 2)"; ++ if (ValueName == "vz512mem") ++ return "(%esp, %zmm15, 2)"; ++ if (ValueName == "i512mem_GR16") ++ return "(%si)"; ++ if (ValueName == "i512mem_GR64") ++ return "(%rsp)"; ++ return "(%esp)"; ++ } else if (PrintMethod == "printtbytemem") ++ return "(%esp)"; ++ else if (PrintMethod == 
"printMemReference") ++ return "(%esp)"; ++ else if (PrintMethod == "printSrcIdx8") ++ return "(%esi)"; ++ else if (PrintMethod == "printSrcIdx16") ++ return "(%esi)"; ++ else if (PrintMethod == "printSrcIdx32") ++ return "(%esi)"; ++ else if (PrintMethod == "printSrcIdx64") ++ return "(%esi)"; ++ else if (PrintMethod == "printDstIdx8") ++ return "%es:(%edi)"; ++ else if (PrintMethod == "printDstIdx16") ++ return "%es:(%edi)"; ++ else if (PrintMethod == "printDstIdx32") ++ return "%es:(%edi)"; ++ else if (PrintMethod == "printDstIdx64") ++ return "%es:(%edi)"; ++ else if (PrintMethod == "printMemOffs8") ++ return "0x01"; ++ else if (PrintMethod == "printMemOffs16") ++ return "0x1234"; ++ else if (PrintMethod == "printMemOffs32") ++ return "0x12345678"; ++ else if (PrintMethod == "printMemOffs64") ++ return "0x123456789abcdef0"; ++ else ++ llvm_unreachable("Unknown memory print method!"); ++} ++ ++static std::string genPCRelMemoryEnum(const CGIOperandList::OperandInfo *OI) { ++ StringRef PrintMethod = OI->PrinterMethodName; ++ if (PrintMethod == "printPCRelImm") ++ return "0x12345678"; ++ else ++ llvm_unreachable("Unknown PC-relative print method!"); ++} ++ ++static std::string genRoundCtrlEnum(const CGIOperandList::OperandInfo *OI) { ++ StringRef PrintMethod = OI->PrinterMethodName; ++ if (PrintMethod == "printRoundingControl") ++ return "{rn-sae}"; ++ else ++ llvm_unreachable("Unknown rounding control print method!"); ++} ++ ++static std::string genEnumByOperandInfo(CodeGenTarget &CGT, ++ const CGIOperandList::OperandInfo *OI) { ++ if (OI->OperandType == "MCOI::OPERAND_REGISTER") ++ return genRegisterEnum(CGT.getRegBank(), OI); ++ else if (OI->OperandType == "MCOI::OPERAND_IMMEDIATE") ++ return genImmediateEnum(OI); ++ else if (OI->OperandType == "MCOI::OPERAND_MEMORY") ++ return genMemoryEnum(OI); ++ else if (OI->OperandType == "MCOI::OPERAND_PCREL") ++ return genPCRelMemoryEnum(OI); ++ else if (OI->OperandType == "X86::OPERAND_ROUNDING_CONTROL") ++ return genRoundCtrlEnum(OI); ++ 
else { ++ StringRef ValueName = OI->Rec->getName(); ++ if (ValueName == "lea64mem" || ValueName == "lea64_32mem") ++ return genMemoryEnum(OI); ++ else { ++ PrintFatalError(OI->Rec->getLoc(), "Unknown operand!"); ++ } ++ } ++} ++ ++static void scanSchedRW(json::Object &JSchedRW, Record *SchedRW) { ++ JSchedRW["Name"] = SchedRW->getName(); ++ JSchedRW["Type"] = SchedRW->getType()->getAsString(); ++ if (SchedRW->isSubClassOf("WriteSequence")) { ++ json::Array Writes; ++ for (Record *SubWrite : SchedRW->getValueAsListOfDefs("Writes")) { ++ json::Object JSubWrite; ++ scanSchedRW(JSubWrite, SubWrite); ++ Writes.push_back(std::move(JSubWrite)); ++ } ++ JSchedRW["Writes"] = std::move(Writes); ++ JSchedRW["Repeat"] = SchedRW->getValueAsInt("Repeat"); ++ } ++} ++ ++static std::string genAsmEnum(const MatchableInfo *MI, CodeGenTarget &Target) { ++ // Reset Global index. ++ RegIdx = 1; ++ ++ StringMap OpName2Enum; ++ std::string AsmEnum(MI->Mnemonic.str()); ++ StringRef AsmString = MI->AsmString; ++ unsigned ASIdx = MI->Mnemonic.size(); ++ for (auto &Op : MI->AsmOperands) { ++ size_t TokIdx = AsmString.find(Op.Token, ASIdx); ++ assert(TokIdx != StringRef::npos && "Token not exist"); ++ AsmEnum += AsmString.substr(ASIdx, TokIdx - ASIdx); ++ ASIdx = TokIdx + Op.Token.size(); ++ if (Op.Class->Kind == ClassInfo::Token || Op.SingletonReg) { ++ AsmEnum += Op.Token.str(); ++ continue; ++ } ++ ++ std::string &OpEnum = OpName2Enum[Op.SrcOpName]; ++ if (!OpEnum.size()) { ++ const CGIOperandList::OperandInfo *OI = nullptr; ++ if (MI->DefRec.is()) { ++ for (auto &OpInfo : ++ MI->DefRec.get()->Operands) ++ if (OpInfo.Name == Op.OrigSrcOpName) { ++ OI = &OpInfo; ++ break; ++ } ++ } else { ++ const CodeGenInstAlias *CGA = ++ MI->DefRec.get(); ++ for (unsigned I = 0, E = CGA->ResultOperands.size(); I != E; I++) { ++ auto &RO = CGA->ResultOperands[I]; ++ if (RO.isRecord() && RO.getName() == Op.OrigSrcOpName) { ++ OI = &(MI->getResultInst() ++ ->Operands[CGA->ResultInstOperandIndex[I].first]); 
++ break; ++ } ++ } ++ } ++ assert(OI && "Can't found OperandInfo"); ++ OpEnum = genEnumByOperandInfo(Target, OI); ++ if (OpEnum == "%cr1") ++ OpEnum = "%cr0"; ++ AsmEnum += OpEnum; ++ } ++ } ++ assert(ASIdx == AsmString.size()); ++ return AsmEnum; ++} ++ ++namespace llvm { ++ ++void EmitX86InstSchedInfo(RecordKeeper &RK, raw_ostream &OS) { ++ CodeGenTarget Target(RK); ++ if (Target.getInstNamespace() != "X86") ++ return; ++ ++ Record *AsmParser = Target.getAsmParser(); ++ assert(AsmParser->getValueAsBit("HasMnemonicFirst")); ++ ++ // Compute the information on the instructions to match. ++ AsmMatcherInfo Info(AsmParser, Target, RK); ++ Info.buildInfo(/*AllowX86NoPseudo=*/true); ++ ++ // Sort the instruction table using the partial order on classes. We use ++ // stable_sort to ensure that ambiguous instructions are still ++ // deterministically ordered. ++ llvm::stable_sort( ++ Info.Matchables, ++ [](const std::unique_ptr &a, ++ const std::unique_ptr &b) { return *a < *b; }); ++ ++ Record *AsmVariant = RK.getDef("ATTAsmParserVariant"); ++ AsmVariantInfo Variant; ++ Variant.RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix"); ++ Variant.TokenizingCharacters = ++ AsmVariant->getValueAsString("TokenizingCharacters"); ++ Variant.SeparatorCharacters = ++ AsmVariant->getValueAsString("SeparatorCharacters"); ++ Variant.BreakCharacters = AsmVariant->getValueAsString("BreakCharacters"); ++ Variant.Name = AsmVariant->getValueAsString("Name"); ++ Variant.AsmVariantNo = AsmVariant->getValueAsInt("Variant"); ++ ++ DenseMap CGI2MI; ++ for (auto &MI : Info.Matchables) { ++ const CodeGenInstruction *ResultInst = MI->getResultInst(); ++ if (MI->AsmVariantID == Variant.AsmVariantNo) ++ CGI2MI.insert(std::make_pair(ResultInst, MI.get())); ++ } ++ ++ json::Object JRoot; ++ DenseMap Opc2AsmEnum; ++ for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) { ++ if (!II->TheDef->getValueInit("SchedRW")->isComplete()) ++ continue; ++ ++ json::Object JInstInfo; 
++ json::Array JSchedReads, JSchedWrites; ++ for (Record *SchedRW : II->TheDef->getValueAsListOfDefs("SchedRW")) { ++ if (SchedRW->isSubClassOf("SchedWrite")) { ++ json::Object JSchedWrite; ++ scanSchedRW(JSchedWrite, SchedRW); ++ JSchedWrites.push_back(std::move(JSchedWrite)); ++ } else if (SchedRW->isSubClassOf("SchedRead")) { ++ json::Object JSchedRead; ++ scanSchedRW(JSchedRead, SchedRW); ++ JSchedReads.push_back(std::move(JSchedRead)); ++ } else ++ llvm_unreachable("SchedRW should be SchedRead or SchedWrite"); ++ } ++ JInstInfo["SchedReads"] = std::move(JSchedReads); ++ JInstInfo["SchedWrites"] = std::move(JSchedWrites); ++ ++ if (CGI2MI.count(II)) { ++ const MatchableInfo *MI = CGI2MI[II]; ++ json::Array JMode = json::Array({64, 32, 16}); ++ for (auto *RF : MI->RequiredFeatures) { ++ StringRef ValueName = RF->TheDef->getName(); ++ if (ValueName == "In16BitMode") ++ JMode = json::Array({16}); ++ else if (ValueName == "In32BitMode") ++ JMode = json::Array({32}); ++ else if (ValueName == "In64BitMode") ++ JMode = json::Array({64}); ++ else if (ValueName == "Not16BitMode") ++ JMode = json::Array({64, 32}); ++ else if (ValueName == "Not64BitMode") ++ JMode = json::Array({32, 16}); ++ else ++ llvm_unreachable("Unknown required feature"); ++ } ++ ++ // In64BitMode in some X86 CG only instructions are intentionally omitted. 
++ if (II->TheDef->getValue("hasREX_W") && ++ II->TheDef->getValueAsBit("hasREX_W")) ++ JMode = json::Array({64}); ++ ++ std::string AsmEnum; ++ StringRef Opcode = II->TheDef->getName(); ++ if (Opcode == "JCC_2" || Opcode == "JCC_4") ++ AsmEnum = Opc2AsmEnum["JCC_1"]; ++ else ++ AsmEnum = genAsmEnum(MI, Target); ++ Opc2AsmEnum[Opcode] = AsmEnum; ++ JInstInfo["Modes"] = std::move(JMode); ++ JInstInfo["AsmString"] = AsmEnum; ++ } ++ ++ JRoot[II->TheDef->getName()] = std::move(JInstInfo); ++ } ++ OS << formatv("{0:2}", json::Value(std::move(JRoot))); ++} ++ ++} // end namespace llvm ++ ++static TableGen::Emitter::Opt Y("gen-x86-inst-sched-info", EmitX86InstSchedInfo, ++ "Generate instruction schedreadwrite info for x86 backend"); ++ ++//===----------------------------------------------------------------------===// +-- +2.18.1 + diff --git a/llvm/utils/schedtool/llvm-patch/0004-Support-debug-only-print-opcode-for-llvm-mc-disassem.patch b/llvm/utils/schedtool/llvm-patch/0004-Support-debug-only-print-opcode-for-llvm-mc-disassem.patch new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/llvm-patch/0004-Support-debug-only-print-opcode-for-llvm-mc-disassem.patch @@ -0,0 +1,61 @@ +From 51712091c90c1e0e6b600aadc440750cb208b63a Mon Sep 17 00:00:00 2001 +From: Haohai Wen +Date: Wed, 23 Mar 2022 17:16:56 +0800 +Subject: [PATCH 4/4] Support -debug-only=print-opcode for llvm-mc disassembler + +--- + llvm/tools/llvm-mc/Disassembler.cpp | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp +index 7456a2f2c915..6efce1f57e70 100644 +--- a/llvm/tools/llvm-mc/Disassembler.cpp ++++ b/llvm/tools/llvm-mc/Disassembler.cpp +@@ -25,6 +25,8 @@ + #include "llvm/Support/SourceMgr.h" + #include "llvm/Support/raw_ostream.h" + #include "llvm/TargetParser/Triple.h" ++#include "llvm/MC/MCInstrInfo.h" ++#include "llvm/Support/Debug.h" + + using namespace llvm; + +@@ -35,7 +37,7 @@ static bool 
PrintInsts(const MCDisassembler &DisAsm, + const ByteArrayTy &Bytes, + SourceMgr &SM, raw_ostream &Out, + MCStreamer &Streamer, bool InAtomicBlock, +- const MCSubtargetInfo &STI) { ++ const MCSubtargetInfo &STI, const MCInstrInfo &MCII) { + ArrayRef Data(Bytes.first.data(), Bytes.first.size()); + + // Disassemble it to strings. +@@ -68,6 +70,7 @@ static bool PrintInsts(const MCDisassembler &DisAsm, + [[fallthrough]]; + + case MCDisassembler::Success: ++ DEBUG_WITH_TYPE("print-opcode", outs() << MCII.getName(Inst.getOpcode())); + Streamer.emitInstruction(Inst, STI); + break; + } +@@ -155,6 +158,9 @@ int Disassembler::disassemble(const Target &T, const std::string &Triple, + return -1; + } + ++ std::unique_ptr MCII(T.createMCInstrInfo()); ++ assert(MCII && "Unable to create instruction info!"); ++ + // Set up initial section manually here + Streamer.initSections(false, STI); + +@@ -194,7 +200,7 @@ int Disassembler::disassemble(const Target &T, const std::string &Triple, + + if (!ByteArray.first.empty()) + ErrorOccurred |= PrintInsts(*DisAsm, ByteArray, SM, Out, Streamer, +- InAtomicBlock, STI); ++ InAtomicBlock, STI, *MCII); + } + + if (InAtomicBlock) { +-- +2.18.1 + diff --git a/llvm/utils/schedtool/schedgen/schedgen.py b/llvm/utils/schedtool/schedgen/schedgen.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/schedgen/schedgen.py @@ -0,0 +1,423 @@ +import json, collections, sys + +import lib.target as target +import lib.utils as utils +from lib.info_parser import parse_llvm_instr_info +from lib.llvm_instr import * + + +class LLVMSchedGen: + def __init__(self, llvm_instrs, target_cpu): + self.target_cpu = target_cpu + self.llvm_instrs = llvm_instrs + self.clean_wrong_schedwrite() + self.infer_schedwrite_resources() + self.infer_schedwriteres() + self.validate_infered_resource() + self.tag_unsupported_schedwrite() + + def gen_scheduler(self, ostream): + self.emit_scheduler(ostream) + + def clean_wrong_schedwrite(self): + ''' Some schedwrites of instr 
are wrong which must be removed. ''' + for llvm_instr in self.llvm_instrs: + if not llvm_instr.has_uops_info(): + continue + instr_latency = llvm_instr.uops_info.latency + instr_ports = llvm_instr.uops_info.ports + instr_num_uops = llvm_instr.uops_info.num_uops + wrong_aux_schedwrites, wrong_writesequences = [], [] + for schedwrite in llvm_instr.schedwrites: + if schedwrite.is_aux(): + assert schedwrite.is_complete() + if (schedwrite.latency > instr_latency + or schedwrite.num_uops > instr_num_uops + or not utils.listcontain(instr_ports, + schedwrite.resources)): + wrong_aux_schedwrites.append(schedwrite) + elif type(schedwrite) is WriteSequence: + ext_latency, ext_num_uops, ext_ports = 0, 0, [] + for leaf_write in schedwrite.expand(): + if not leaf_write.is_complete(): + continue + ext_latency += leaf_write.latency + ext_num_uops += leaf_write.num_uops + ext_ports.extend(leaf_write.resources) + if (ext_latency > instr_latency + or ext_num_uops > instr_num_uops + or not utils.listcontain(instr_ports, ext_ports)): + wrong_writesequences.append(schedwrite) + + # Wrong aux schedwrite must be removed. + if len(wrong_aux_schedwrites): + llvm_instr.set_use_instrw(True) + for wrong_sw in wrong_aux_schedwrites: + llvm_instr.schedwrites.remove(wrong_sw) + + # Wrong writesequence is replaced to WriteZero. infer_schedwriteres + # will replace WriteZero to SchedWriteRes. + if len(wrong_writesequences): + llvm_instr.set_use_instrw(True) + for wrong_ws in wrong_writesequences: + llvm_instr.replace_or_add_schedrw(wrong_ws, + SchedWrite('WriteZero'), + not_null=True) + + def infer_schedwrite_resources(self): + ''' Infer resources, latency def for schedwrite. ''' + # Map from schedwrite to associated llvm_instrs. + sw2instrs = {} + for llvm_instr in self.llvm_instrs: + for schedwrite in llvm_instr.schedwrites: + sw2instrs.setdefault(schedwrite, []).append(llvm_instr) + + # TODO: resource_cycles is not derived. 
+ for schedwrite, llvm_instrs in sw2instrs.items(): + if schedwrite.is_complete(): + continue + candidates = [] + for llvm_instr in llvm_instrs: + if not llvm_instr.has_uops_info(): + continue + dr_latency = llvm_instr.uops_info.latency + dr_num_uops = llvm_instr.uops_info.num_uops + dr_ports = llvm_instr.uops_info.ports + for instr_sw in llvm_instr.schedwrites: + if instr_sw == schedwrite: + continue + assert instr_sw.is_complete() and instr_sw.is_aux(), \ + f'[{schedwrite}, {instr_sw}] only 1 incompleted ' \ + f'schedwrite is allowed.' + dr_num_uops -= instr_sw.num_uops + dr_ports = utils.listremove(dr_ports, instr_sw.resources) + dr_ports = tuple(sorted(dr_ports)) + candidates.append((dr_latency, dr_num_uops, dr_ports)) + + # Pick up a choice for schedwrite. + choices = collections.Counter(candidates).most_common() + if len(choices): + for choice, cnt in choices: + # If latency, num_uops >= 0. + if choice[0] >= 0 and choice[1] >= 0: + best_choice = choice + break + else: + raise ValueError('Not find best choice.') + + dr_latency = best_choice[0] + dr_num_uops = best_choice[1] + dr_ports = best_choice[2] + + write = schedwrite + if type(schedwrite) is WriteSequence: + write = None + leaf_writes = schedwrite.expand() + for leaf_write in leaf_writes: + if leaf_write.is_complete(): + dr_latency -= leaf_write.latency + dr_num_uops -= leaf_write.num_uops + dr_ports = utils.listremove( + dr_ports, leaf_write.resources) + continue + assert write is None, (f'multi leaf schedwrite' + f'incompleted: {leaf_writes}') + write = leaf_write + dr_ports = tuple(sorted(dr_ports)) + + # Set all resource_cycles to 1 cycle for convenience. 
+ write.set_resources(resources=dr_ports, + resource_cycles=(1, ) * len(dr_ports), + num_uops=dr_num_uops, + latency=dr_latency) + + def infer_schedwriteres(self): + for llvm_instr in self.llvm_instrs: + if not llvm_instr.has_uops_info(): + continue + dr_latency = llvm_instr.uops_info.latency + dr_num_uops = llvm_instr.uops_info.num_uops + dr_ports = llvm_instr.uops_info.ports + + old_schedwrite = None + for schedwrite in llvm_instr.schedwrites: + if schedwrite.is_aux(): + assert dr_latency >= schedwrite.latency + dr_num_uops -= schedwrite.num_uops + dr_ports = utils.listremove(dr_ports, schedwrite.resources) + else: + assert old_schedwrite is None + old_schedwrite = schedwrite + + if (old_schedwrite and old_schedwrite.latency == dr_latency + and old_schedwrite.num_uops == dr_num_uops + and utils.cmplist(old_schedwrite.resources, dr_ports)): + continue + + assert dr_num_uops >= 0 + dr_ports = tuple(sorted(dr_ports)) + schedwriteres = SchedWriteRes(resources=dr_ports, + resource_cycles=(1, ) * + len(dr_ports), + latency=dr_latency, + num_uops=dr_num_uops, + prefix=self.target_cpu.short_name) + llvm_instr.replace_or_add_schedrw(old_schedwrite, schedwriteres) + llvm_instr.set_use_instrw(True) + + def validate_infered_resource(self): + for llvm_instr in self.llvm_instrs: + if not llvm_instr.has_uops_info(): + continue + assert ( + llvm_instr.uops_info.latency == llvm_instr.compute_latency()) + assert ( + llvm_instr.uops_info.num_uops == llvm_instr.compute_num_uops()) + assert utils.cmplist(llvm_instr.uops_info.ports, + llvm_instr.compute_resources()) + + def tag_unsupported_schedwrite(self): + sw2instrs = {} + for llvm_instr in self.llvm_instrs: + for schedwrite in llvm_instr.schedwrites: + if type(schedwrite) is WriteSequence: + for leaf_write in schedwrite.expand(): + sw2instrs.setdefault(leaf_write, []).append(llvm_instr) + else: + sw2instrs.setdefault(schedwrite, []).append(llvm_instr) + + for schedwrite, llvm_instrs in sw2instrs.items(): + is_spt = 
len(llvm_instrs) == 0 + is_spt = is_spt or all(llvm_instr.isa_set is None + for llvm_instr in llvm_instrs) + is_spt = is_spt or not all(llvm_instr.isa_set is None or + llvm_instr.is_invalid(self.target_cpu) + for llvm_instr in llvm_instrs) + schedwrite.set_supported(is_spt) + + def emit_scheduler(self, ostream): + with open(self.target_cpu.template_td) as td: + ostream.write(td.read()) + ostream.write(f'\n//==={"-"*70}===//\n') + ostream.write('// The following definitions are inferred by smg.\n') + ostream.write(f'//==={"-"*70}===//\n\n') + ostream.write('// Inferred SchedWrite definition.\n') + + # Populate schedwrite and emit them. + lived_schedwrites = set() + for llvm_instr in self.llvm_instrs: + for instr_sw in llvm_instr.schedwrites: + if type(instr_sw) is WriteSequence: + for leaf_write in instr_sw.expand(): + assert type(leaf_write) is SchedWrite + lived_schedwrites.add(leaf_write) + elif type(instr_sw) is SchedWrite: + lived_schedwrites.add(instr_sw) + dead_schedwrites = tuple( + sorted(set(SchedWrite.get_all()) - lived_schedwrites)) + lived_schedwrites = collections.deque(sorted(lived_schedwrites)) + + while len(lived_schedwrites): + write = lived_schedwrites.popleft() + write_mem = SchedWrite.get(write.name + 'Ld') + writes = (write, ) + + if write_mem: + lived_schedwrites.remove(write_mem) + writes = (write, write_mem) + if all(not x.is_supported() for x in (write, write_mem)): + self.emit_write_res_pair_unsupported(ostream, write) + continue + + if all(x.is_complete() for x in (write, write_mem)): + if self.try_emit_write_res_pair(ostream, write, write_mem): + continue + + for schedwrite in writes: + if not schedwrite.is_supported(): + self.emit_write_res_unsupported(ostream, schedwrite) + elif not schedwrite.is_complete(): + ostream.write('// FIXME: Incomplete schedwrite.\n') + self.emit_write_res_unsupported(ostream, schedwrite) + else: + self.emit_write_res(ostream, schedwrite) + + if len(dead_schedwrites): + ostream.write('\n// Dead schedwrites that 
nobody uses.\n') + for dead_write in dead_schedwrites: + self.emit_write_res_unsupported(ostream, dead_write) + + # Group instrs which used InstRW based on schedrws. + schedrws2instrs = {} + for llvm_instr in self.llvm_instrs: + if not llvm_instr.use_instrw(): + continue + + # SchedWriteRes comes first, then SchedWrite, SchedRead. + schedrws = tuple( + sorted(llvm_instr.schedreads + llvm_instr.schedwrites, + key=lambda x: + (type(x) is SchedRead, type(x) is SchedWrite, type(x) is + SchedWriteRes, x.name))) + schedrws2instrs.setdefault(schedrws, []).append(llvm_instr) + + schedrws2instrs = dict( + sorted(schedrws2instrs.items(), key=lambda x: + (x[0][0], len(x[0])))) + + # Emit SchedWriteRes and InstRW. + ostream.write('\n// Infered SchedWriteRes and InstRW definition.\n') + emitted = set() + for schedrws, llvm_instrs in schedrws2instrs.items(): + for schedrw in schedrws: + if type(schedrw) is SchedWriteRes and schedrw not in emitted: + emitted.add(schedrw) + ostream.write('\n') + self.emit_schedwriteres(ostream, schedrw) + self.emit_instrw(ostream, schedrws, llvm_instrs) + + # Emit tailer bracket + ostream.write('\n}\n') + + def emit_write_res_pair_unsupported(self, ostream, schedwrite): + ostream.write( + f'defm : X86WriteResPairUnsupported<{schedwrite.name}>;\n') + + def emit_write_res_unsupported(self, ostream, schedwrite): + ostream.write(f'defm : X86WriteResUnsupported<{schedwrite.name}>;\n') + + def try_emit_write_res_pair(self, ostream, write_reg, write_mem): + ports_diff = utils.listdiff(write_reg.resources, write_mem.resources) + # Return false if ports_diff is empty or all diffs aren't load ports. 
+ if len(ports_diff) == 0 or any(port != self.target_cpu.load_ports + for port in ports_diff): + return False + + num_loads = len(ports_diff) + if write_mem.num_uops - write_reg.num_uops != num_loads: + return False + + short_name = self.target_cpu.short_name + + res_defs = collections.Counter(write_reg.resources).items() + exe_ports = '[' + ', '.join( + self.target_cpu.get_ports_name(res[0]) for res in res_defs) + ']' + latstr = self.target_cpu.lat2str(write_reg.latency) + + load_lat = write_mem.latency - write_reg.latency + if load_lat < 0: + ostream.write('// Warning: negative load latency.\n') + + ostream.write(f'defm : {short_name}WriteResPair<{write_reg.name}, ' + f'{exe_ports}, {latstr}') + tailer = '>;\n' + must_present = False + if num_loads != 1: + tailer = f', {num_loads}' + tailer + must_present = True + if must_present or load_lat != self.target_cpu.load_latency: + tailer = f', {load_lat}' + tailer + must_present = True + if must_present or write_reg.num_uops != 1: + tailer = f', {write_reg.num_uops}' + tailer + must_present = True + if must_present or write_reg.resource_cycles != [1]: + resource_cycles = '[' + ', '.join(str(res[1]) + for res in res_defs) + ']' + tailer = f', {resource_cycles}' + tailer + ostream.write(tailer) + return True + + def emit_write_res(self, ostream, schedwrite): + num_uops = schedwrite.num_uops + res_defs = collections.Counter(schedwrite.resources).items() + exe_ports = '[' + ', '.join( + self.target_cpu.get_ports_name(res[0]) for res in res_defs) + ']' + resource_cycles = tuple(res[1] for res in res_defs) + latstr = self.target_cpu.lat2str(schedwrite.latency) + + if num_uops != 1: + ostream.write( + f'defm : X86WriteRes<{schedwrite.name}, {exe_ports}, ' + f'{latstr}, {list(resource_cycles)}, {num_uops}>;\n') + else: + ostream.write(f'def : WriteRes<{schedwrite.name}, {exe_ports}>') + tailer = '' + if resource_cycles != (1, ) * len(resource_cycles): + tailer += f' let ResourceCycles = {list(resource_cycles)};\n' + if 
schedwrite.latency != 1: + tailer += f' let Latency = {latstr};\n' + if tailer: + tailer = ' {\n' + tailer + '}\n' + else: + tailer = ';\n' + ostream.write(tailer) + + def emit_schedwriteres(self, ostream, schedwriteres): + res_defs = collections.Counter(schedwriteres.resources).items() + exe_ports = '[' + ', '.join( + self.target_cpu.get_ports_name(res[0]) for res in res_defs) + ']' + resource_cycles = tuple(res[1] for res in res_defs) + latstr = self.target_cpu.lat2str(schedwriteres.latency) + + ostream.write(f'def {schedwriteres.name} : SchedWriteRes<{exe_ports}>') + tailer = '' + if resource_cycles != (1, ) * len(resource_cycles): + tailer += f' let ResourceCycles = {list(resource_cycles)};\n' + if schedwriteres.latency != 1: + tailer += f' let Latency = {latstr};\n' + if schedwriteres.num_uops != 1: + tailer += f' let NumMicroOps = {schedwriteres.num_uops};\n' + if tailer: + tailer = ' {\n' + tailer + '}\n' + else: + tailer = ';\n' + ostream.write(tailer) + + def emit_instrw(self, ostream, schedrws, llvm_instrs): + instrs_regexes, instrs_opcode = [], [] + + for expr in utils.RegexReducer(4).reduce( + [x.opcode for x in llvm_instrs]): + if any(char in expr for char in ('(', ')', '|', '?', '*')): + instrs_regexes.append(expr) + else: + instrs_opcode.append(expr) + + # Emit instregex. + if instrs_regexes: + header = 'def : InstRW<[' + ', '.join([x.name for x in schedrws + ]) + '], (instregex ' + ostream.write(header) + indent = False + for instrs_regex in instrs_regexes: + if indent: + ostream.write(',\n' + ' ' * len(header)) + else: + indent = True + ostream.write(f'"^{instrs_regex}$"') + ostream.write(')>;\n') + + # Emit instrs. 
+ if instrs_opcode: + header = 'def : InstRW<[' + ', '.join([x.name for x in schedrws + ]) + '], (instrs ' + ostream.write(header) + indent = False + for opcode in instrs_opcode: + if indent: + ostream.write(',\n' + ' ' * len(header)) + else: + indent = True + ostream.write(f'{opcode}') + ostream.write(')>;\n') + + +def main(args): + target_cpu = target.get_target(args.target_cpu) + with open(args.jf) as jf: + llvm_instrs = parse_llvm_instr_info(json.load(jf), target_cpu) + + ostream = sys.stdout if args.o == '-' else open(args.o, 'w') + LLVMSchedGen(llvm_instrs, target_cpu).gen_scheduler(ostream) + ostream.close() diff --git a/llvm/utils/schedtool/schedver/schedver.py b/llvm/utils/schedtool/schedver/schedver.py new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/schedver/schedver.py @@ -0,0 +1,56 @@ +import sys, os, json, subprocess +from lib import target +from lib.info_parser import parse_smv_instr_info, parse_llvm_instr_info +from lib.llvm_instr import * + + +def get_smv_instrs(target_cpu): + smv_instrs_json = subprocess.run( + f'llvm-smv -mcpu={target_cpu.proc_name}', + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL).stdout.decode('utf-8') + return parse_smv_instr_info(json.loads(smv_instrs_json), target_cpu) + + +class LLVMSchedVerifier: + def __init__(self, llvm_instrs, target_cpu): + self.target_cpu = target_cpu + self.llvm_instrs = llvm_instrs + self.smv_instrs = get_smv_instrs(target_cpu) + + def run(self): + opc2smv_instrs = { + smv_instr.opcode: smv_instr + for smv_instr in self.smv_instrs + } + + for llvm_instr in self.llvm_instrs: + if (not llvm_instr.has_uops_info() + or llvm_instr.is_invalid(self.target_cpu)): + continue + + uops_info = llvm_instr.uops_info + smv_instr = opc2smv_instrs[llvm_instr.opcode] + assert uops_info.latency == smv_instr.latency + assert uops_info.num_uops == smv_instr.num_uops + + # FIXME: We assume each uops consumes only 1 cycles. 
+ res_cycles = { + ports: cycles + for ports, cycles in zip(smv_instr.resources, + smv_instr.resource_cycles) + } + assert set(smv_instr.resources) == set(uops_info.ports) + for ports in uops_info.ports: + res_cycles[ports] -= 1 + assert all(cycs == 0 for cycs in res_cycles.values()) + print('Pass') + + +def main(args): + target_cpu = target.get_target(args.target_cpu) + with open(args.jf) as jf: + llvm_instrs = parse_llvm_instr_info(json.load(jf), target_cpu) + LLVMSchedVerifier(llvm_instrs, target_cpu).run() diff --git a/llvm/utils/schedtool/smg b/llvm/utils/schedtool/smg new file mode 100755 --- /dev/null +++ b/llvm/utils/schedtool/smg @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import argparse +from schedgen import schedgen +from schedver import schedver + + +def parse_command_line(): + parser = argparse.ArgumentParser( + description='llvm schedule model generator.') + subparsers = parser.add_subparsers(dest='command') + generator_parser = subparsers.add_parser('gen', + description='generate schedmodel') + generator_parser.add_argument('--target-cpu', + required=True, + help='target cpu') + generator_parser.add_argument('-o', default='-', help='output file') + generator_parser.add_argument('jf', help='instruction uops info json file') + + verifier_parser = subparsers.add_parser('verify', + description='verify schedmodel') + verifier_parser.add_argument('--target-cpu', + required=True, + help='target cpu') + verifier_parser.add_argument('jf', help='instruction uops info json file') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_command_line() + if args.command == 'gen': + schedgen.main(args) + elif args.command == 'verify': + schedver.main(args) diff --git a/llvm/utils/schedtool/tools/add_adl_p_uopsinfo.py b/llvm/utils/schedtool/tools/add_adl_p_uopsinfo.py new file mode 100755 --- /dev/null +++ b/llvm/utils/schedtool/tools/add_adl_p_uopsinfo.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +import argparse, json, sys, subprocess, math 
+from collections import Counter +from multiprocessing import Pool + + +def parse_command_line(): + parser = argparse.ArgumentParser( + description='Add alderlake-p/sapphirerapids uops info from intel doc.') + parser.add_argument('-o', default='-', help='output file') + parser.add_argument('--overwrite', + default=False, + action='store_true', + help='Overwrite info if it existed') + parser.add_argument('--jf', + default='-', + help='instruction sched info json file') + parser.add_argument('--adl-p-json', + '--spr-json', + required=True, + help='alderlake-p/sapphirerapids tpt lat json file') + return parser.parse_args() + + +def duops2ports(duops): + uops_info = [] + for ports_desc, num_uops in Counter(item['ports'] + for item in duops).items(): + if ports_desc != '': + ports = [int(i, 16) for i in ports_desc] + uops_info.append([num_uops, ports]) + return uops_info + + +def disassemble(encode): + blocks = [] + for i in range(math.ceil(len(encode) / 2)): + blocks.append(f'0x{encode[2*i:2*i+2]}') + formatted_encode = ','.join(blocks) + triples = ('x86_64', 'i386', 'i686-linux-gnu-code16') + for triple in triples: + cmd = (f"echo -e '{formatted_encode}' | llvm-mc --disassemble " + f"--triple={triple} --debug-only=print-opcode -o /dev/null") + result = subprocess.run(cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + parsed_opcode = result.stdout.decode('utf-8') + if parsed_opcode != '': + return encode, parsed_opcode + return encode, None + + +if __name__ == '__main__': + args = parse_command_line() + ostream = sys.stdout if args.o == '-' else open(args.o, 'w') + istream = sys.stdin if args.jf == '-' else open(args.jf, 'r') + + encode2uopsinfo = {} + with open(args.adl_p_json) as adl_p_json: + for info in json.load(adl_p_json): + encode = info['uniq_key'] + assert encode not in encode2uopsinfo + if (len(info.get('duops', [])) == 0 + or all(item['ports'] == '' for item in info['duops'])): + continue + + ports = 
duops2ports(info['duops'])
+            est_uops = sum(item[0] for item in ports)
+            uops = int(info.get('uops_number', est_uops))
+            if uops < est_uops:
+                print(f"{encode} :",
+                      f'uops derived from ports ({est_uops}) > '
+                      f'uops listed ({uops}), use derived one',
+                      file=sys.stderr)
+                uops = est_uops
+            tp = float(info['throughput']) if 'throughput' in info else None
+            latency = (int(float(info['latency']))
+                       if 'latency' in info else None)
+            entry = {}
+            for name, value in zip(('Port', 'Uops', 'Tp', 'Latency'),
+                                   (ports, uops, tp, latency)):
+                if value is not None:
+                    entry[name] = value
+            encode2uopsinfo[encode] = entry
+
+    with Pool() as pool:
+        result = pool.map(disassemble, encode2uopsinfo.keys())
+
+    sig_name = 'hw-adl'
+    instr_sched_info = json.load(istream)
+    for encode, parsed_opcode in result:
+        if parsed_opcode is None:
+            continue
+        sched_info = instr_sched_info[parsed_opcode]
+        if 'XedInfo' not in sched_info:
+            continue
+        uops_info = encode2uopsinfo[encode]
+        for key, value in uops_info.items():
+            if args.overwrite or key not in sched_info:
+                sched_info[key] = value
+                assert sched_info.get(f'{key}Sig', None) != sig_name
+                sched_info[f'{key}Sig'] = sig_name
+        # if port is updated then uops must be consistent with port.
+        if sched_info['PortSig'] == sig_name:
+            sched_info['Uops'] = uops_info['Uops']
+            sched_info['UopsSig'] = sig_name
+
+    json.dump(instr_sched_info, ostream, indent=2)
+    istream.close()
+    ostream.close()
diff --git a/llvm/utils/schedtool/tools/add_smv_uopsinfo.py b/llvm/utils/schedtool/tools/add_smv_uopsinfo.py
new file mode 100755
--- /dev/null
+++ b/llvm/utils/schedtool/tools/add_smv_uopsinfo.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+
+import argparse, json, sys, os, re
+
+# Add parent dir to path.
+sys.path.append(f'{os.path.dirname(os.path.realpath(__file__))}/..') + +from schedver.schedver import get_smv_instrs +from lib import target +from lib.llvm_instr import Port + + +def parse_command_line(): + parser = argparse.ArgumentParser(description='Add llvm-smv uops info.') + parser.add_argument('-o', default='-', help='output file') + parser.add_argument('--ref-cpu', required=True, help='reference cpu') + parser.add_argument('--target-cpu', required=True, help='target cpu') + parser.add_argument('--overwrite', + default=False, + action='store_true', + help='Overwrite info if it existed') + parser.add_argument('--jf', + default='-', + help='instruction sched info json file') + return parser.parse_args() + + +def map_resources(opcode, ref_resources, ref_cpu, target_cpu): + target_resources = [] + if (isinstance(ref_cpu, target.SkylakeServer) + and isinstance(target_cpu, target.SapphireRapids)): + for res in ref_resources: + if res == ref_cpu.load_ports: + target_resources.append(target_cpu.load_ports) + elif res == Port.gets((2, 3, 7)): + target_resources.append(Port.gets((7, 8))) + elif res == Port.gets((4, )): + target_resources.append(Port.gets((4, 9))) + elif (res == Port.gets((0, 1, 5, 6)) + and re.match(r'^(ADD|SUB|XOR|AND|OR)\d', opcode)): + target_resources.append(Port.gets((0, 1, 5, 6, 10))) + target_resources.append(res) + return tuple(target_resources) + elif (isinstance(ref_cpu, target.IcelakeServer) + and isinstance(target_cpu, target.SapphireRapids)): + for res in ref_resources: + if res == ref_cpu.load_ports: + target_resources.append(target_cpu.load_ports) + elif res == Port.gets((2, 3, 7)): # STA from SKX + target_resources.append(Port.gets((7, 8))) + elif res == Port.gets((4, )): # STD from SKX + target_resources.append(Port.gets((4, 9))) + elif (res == Port.gets((0, 1, 5, 6)) + and re.match(r'^(ADD|SUB|XOR|AND|OR)\d', opcode)): + target_resources.append(Port.gets((0, 1, 5, 6, 10))) + target_resources.append(res) + return 
tuple(target_resources) + elif (isinstance(ref_cpu, target.Skylake) + and isinstance(target_cpu, target.AlderlakeP)): + for res in ref_resources: + if res == ref_cpu.load_ports: + target_resources.append(target_cpu.load_ports) + elif res == Port.gets((2, 3, 7)): + target_resources.append(Port.gets((7, 8))) + elif res == Port.gets((4, )): + target_resources.append(Port.gets((4, 9))) + elif (res == Port.gets((0, 1, 5, 6)) + and re.match(r'^(ADD|SUB|XOR|AND|OR)\d', opcode)): + target_resources.append(Port.gets((0, 1, 5, 6, 10))) + target_resources.append(res) + return tuple(target_resources) + else: + raise NotImplementedError( + f'Unknown resources map between ' + f'{ref_cpu.proc_name} and {target_cpu.proc_name}') + + +if __name__ == '__main__': + args = parse_command_line() + istream = sys.stdin if args.jf == '-' else open(args.jf, 'r') + ostream = sys.stdout if args.o == '-' else open(args.o, 'w') + + ref_cpu = target.get_target(args.ref_cpu) + target_cpu = target.get_target(args.target_cpu) + instr_sched_info = json.load(istream) + for smv_instr in get_smv_instrs(ref_cpu): + # FIXME: we assume each uop only consume 1 cycle. + ports = [] + opcode = smv_instr.opcode + for resources, cycles in zip( + map_resources(opcode, smv_instr.resources, ref_cpu, + target_cpu), smv_instr.resource_cycles): + ports.append([cycles, [int(str(p)) for p in resources]]) + sig_name = f'smv.{ref_cpu.proc_name}' + uops = smv_instr.num_uops + tp = smv_instr.throughput + latency = smv_instr.latency + uops_info = {'Port': ports, 'Uops': uops, 'Tp': tp, 'Latency': latency} + info = instr_sched_info[smv_instr.opcode] + if 'XedInfo' not in info: + continue + for key, value in uops_info.items(): + # Only add smv uops info to instruction with iform. + if args.overwrite or key not in info: + info[key] = value + assert info.get(f'{key}Sig', None) != sig_name + info[f'{key}Sig'] = sig_name + # if port is updated then uops must be consistent with port. 
+        if info['PortSig'] == sig_name:
+            info['Uops'] = uops_info['Uops']
+            info['UopsSig'] = sig_name
+
+    json.dump(instr_sched_info, ostream, indent=2)
+    istream.close()
+    ostream.close()
diff --git a/llvm/utils/schedtool/tools/add_spr_uopsinfo.py b/llvm/utils/schedtool/tools/add_spr_uopsinfo.py
new file mode 120000
--- /dev/null
+++ b/llvm/utils/schedtool/tools/add_spr_uopsinfo.py
@@ -0,0 +1 @@
+add_adl_p_uopsinfo.py
\ No newline at end of file
diff --git a/llvm/utils/schedtool/tools/add_uops_uopsinfo.py b/llvm/utils/schedtool/tools/add_uops_uopsinfo.py
new file mode 100755
--- /dev/null
+++ b/llvm/utils/schedtool/tools/add_uops_uopsinfo.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+
+import argparse, json, sys, re
+import xml.etree.ElementTree as ET
+
+
+def parse_command_line():
+    parser = argparse.ArgumentParser(
+        description='Add uops.info uops info.')
+    parser.add_argument('-o', default='-', help='output file')
+    parser.add_argument('--jf',
+                        default='-',
+                        help='instruction sched info json file')
+    parser.add_argument('--arch-name', required=True, help='architecture name')
+    parser.add_argument('--overwrite',
+                        default=False,
+                        action='store_true',
+                        help='Overwrite info if it existed')
+    parser.add_argument('--inst-xml',
+                        required=True,
+                        help='uops.info instructions.xml file')
+    parser.add_argument('--debug',
+                        default=False,
+                        action='store_true',
+                        help='Print debug info')
+    return parser.parse_args()
+
+
+print('Warning: port 10 and port 11 are reversed on uops.info.',
+      "Let's swap them.",
+      file=sys.stderr)
+
+
+# TODO: Update this method if uops.info changes ports representation.
+def format_ports(ports_str):
+    uops_info = []
+    for uops_desc in ports_str.split('+'):
+        num_uops, ports_desc = uops_desc.split('*')
+        num_uops = int(num_uops)
+        assert ports_desc[0] == 'p'
+        ports = [int(i, 16) for i in ports_desc[1:]]
+
+        # FIXME: Remove this code once uops.info reverse pA and pB.
+ for i in range(len(ports)): + if ports[i] == 10: + ports[i] = 11 + elif ports[i] == 11: + ports[i] = 10 + + uops_info.append([num_uops, ports]) + return uops_info + + +class XmlInstrInfo: + def __init__(self, attrib): + self.attrib = attrib + self.xml_operands_info = [] + self.xml_uops_info = None + + +if __name__ == '__main__': + args = parse_command_line() + ostream = sys.stdout if args.o == '-' else open(args.o, 'w') + istream = sys.stdin if args.jf == '-' else open(args.jf, 'r') + + iform2xml_instr_infos = {} + root = ET.parse(args.inst_xml).getroot() + for extension in root: + for instruction in extension: + iform = instruction.attrib['iform'] + iclass = instruction.attrib['iclass'] + extension = instruction.attrib['extension'] + xml_instr_info = XmlInstrInfo(instruction.attrib) + iform2xml_instr_infos.setdefault(iform, []).append(xml_instr_info) + for instr_info in instruction: + if instr_info.tag == 'operand': + xml_instr_info.xml_operands_info.append(instr_info.attrib) + continue + + if (instr_info.tag != 'architecture' + or instr_info.attrib['name'] != args.arch_name): + continue + + for perf_info in instr_info: + if perf_info.tag != 'measurement': + continue + + uops = int(perf_info.attrib['uops']) + if uops > 1000000: + print(f'Skip invalid info :{perf_info.attrib}', + file=sys.stderr) + continue + + ports = perf_info.attrib.get('ports', None) + if ports is not None: + ports = format_ports(ports) + est_uops = sum(item[0] for item in ports) + if uops < est_uops: + print(f"{instruction.attrib['string']} :", + f'uops derived from ports ({est_uops}) > ' + f'uops measured ({uops}), use derived one', + file=sys.stderr) + uops = est_uops + if uops == 0: + assert not ports + ports = [] + + tp = min(float(perf_info.attrib['TP_unrolled']), + float(perf_info.attrib['TP_loop'])) + latency = -1 + for child in perf_info: + for key, val in child.attrib.items(): + if not re.match(r'^cycles((_)|(\w+))*$', key): + continue + latency = max(latency, int(val)) + if latency 
== -1: + latency = None + + entry = {} + for name, value in zip(('Port', 'Uops', 'Tp', 'Latency'), + (ports, uops, tp, latency)): + if value is not None: + entry[name] = value + assert xml_instr_info.xml_uops_info is None + xml_instr_info.xml_uops_info = entry + + # Find the suitable uops info. + instr_sched_info = json.load(istream) + for opcode, info in instr_sched_info.items(): + xed_info = info.get('XedInfo', None) + if xed_info is None: + continue + iform = xed_info['IForm'] + if iform not in iform2xml_instr_infos: + continue + + AsmString = info['AsmString'].split('\n')[-1] + has_same_num_opds = lambda xml_instr_info: (len( + xml_instr_info.xml_operands_info) == len(xed_info['OpdsInfo'])) + has_same_eosz = lambda xml_instr_info: (int( + xml_instr_info.attrib.get('eosz', -1)) == xed_info['EOSZ']) + has_same_names = lambda xml_instr_info: ([ + x.get('name', 'UnknowName') + for x in xml_instr_info.xml_operands_info + ] == [x['Name'] for x in xed_info['OpdsInfo']]) + has_same_xtypes = lambda xml_instr_info: ([ + x.get('xtype', 'UnknowXType') + for x in xml_instr_info.xml_operands_info + ] == [x['XType'] for x in xed_info['OpdsInfo']]) + has_same_widths = lambda xml_instr_info: ([ + int(x.get('width', -1)) for x in xml_instr_info.xml_operands_info + ] == [x['Width'] for x in xed_info['OpdsInfo']]) + has_same_zeroing = lambda xml_instr_info: (bool( + int(xml_instr_info.attrib.get('zeroing', 0))) == + ('{z}' in AsmString)) + has_same_mask = lambda xml_instr_info: (bool( + int(xml_instr_info.attrib.get('mask', 0))) == bool( + re.search(r'{%k[0-7]}', AsmString))) + has_same_sae = lambda xml_instr_info: (bool( + int(xml_instr_info.attrib.get('sae', 0))) == bool( + re.search(r'{(r(n|d|u|z)-)?sae}', AsmString))) + has_same_roundc = lambda xml_instr_info: (bool( + int(xml_instr_info.attrib.get('roundc', 0))) == bool( + re.search(r'{r(n|d|u|z)-sae}', AsmString))) + no_imm_zero = lambda xml_instr_info: (int( + xml_instr_info.attrib.get('immzero', 0)) == 0) + + def 
has_same_bcst(xml_instr_info): + xml_match = re.search(r'_(\d+to\d+)', + xml_instr_info.attrib['string']) + asm_match = re.search(r'{(\d+to\d+)}', AsmString) + if xml_match == asm_match: + return True + if xml_match is None or asm_match is None: + return False + return xml_match.group(1) == asm_match.group(1) + + # High priority comes first. + sort_key = lambda xml_instr_info: ( + no_imm_zero(xml_instr_info), + has_same_eosz(xml_instr_info), + has_same_zeroing(xml_instr_info), + has_same_mask(xml_instr_info), + has_same_sae(xml_instr_info), + has_same_roundc(xml_instr_info), + has_same_bcst(xml_instr_info), + has_same_xtypes(xml_instr_info), + has_same_widths(xml_instr_info), + has_same_num_opds(xml_instr_info), + has_same_names(xml_instr_info), + ) + iform2xml_instr_infos[iform].sort(key=sort_key, reverse=True) + + if args.debug: + print(opcode, '-' * 80) + for xml_instr_info in iform2xml_instr_infos[iform]: + print(xml_instr_info.attrib) + print(xml_instr_info.xml_uops_info) + for opi in xml_instr_info.xml_operands_info: + print(' ', opi) + print('') + + sig_name = f'uops.info.{args.arch_name}' + uops_info = iform2xml_instr_infos[iform][0].xml_uops_info + if uops_info is not None: + for key, value in uops_info.items(): + if args.overwrite or key not in info: + info[key] = value + assert info.get(f'{key}Sig', None) != sig_name + info[f'{key}Sig'] = sig_name + # if port is updated then uops must be consistent with port. 
+ if info['PortSig'] == sig_name: + info['Uops'] = uops_info['Uops'] + info['UopsSig'] = sig_name + + json.dump(instr_sched_info, ostream, indent=2) + istream.close() + ostream.close() diff --git a/llvm/utils/schedtool/tools/add_xed_info.py b/llvm/utils/schedtool/tools/add_xed_info.py new file mode 100755 --- /dev/null +++ b/llvm/utils/schedtool/tools/add_xed_info.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 + +import argparse, json, subprocess, sys, re, shutil +from multiprocessing import Pool + + +def parse_command_line(): + parser = argparse.ArgumentParser( + description='llvm schedule model generator.') + parser.add_argument('-o', default='-', help='output file') + parser.add_argument('--xed', help='xed path') + parser.add_argument('--jf', + default='-', + help='instruction sched info json file') + return parser.parse_args() + + +ignore_opcode_list = { + 'MOV64ao32': 'MOV64rm', + 'MOV64o32a': 'MOV64mr', +} + +invalid_opcode_list = ['INVLPGB32', 'LOCK_PREFIX'] + + +def fix_asm(opcode, asm_string, modes): + vex2_asm_string = f'{{VEX2}} {asm_string}' + vex3_asm_string = f'{{VEX3}} {asm_string}' + evex_asm_string = f'{{EVEX}} {asm_string}' + cmd_template = ("echo -e '{assembly}'" + "| llvm-mc --debug-only=print-opcode -o /dev/null") + + parsed_opcodes, best_parsed_opcodes, best_asm = None, None, None + for mode in modes + [None]: + asms = [vex2_asm_string, evex_asm_string, asm_string, vex3_asm_string] + for asm in asms: + if mode is not None: + asm = f'.code{mode}\n{asm}' + cmd = cmd_template.format(assembly=asm) + try: + result = subprocess.run(cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + parsed_opcodes = result.stdout.decode('utf-8').split(',') + except: + continue + else: + if opcode in parsed_opcodes: + if len(parsed_opcodes) == 1: + return opcode, asm + + if (best_parsed_opcodes is None + or len(parsed_opcodes) < len(best_parsed_opcodes)): + best_parsed_opcodes = parsed_opcodes + best_asm = asm + elif 
ignore_opcode_list.get(opcode, None) in parsed_opcodes: + return opcode, asm + + if best_parsed_opcodes is not None: + return opcode, best_asm + else: + print(f"{modes}{cmd}\n'{opcode}': '{parsed_opcodes}',", + file=sys.stderr) + return opcode, asm_string + + +def encode_asm(opcode, asm_string): + result = None + + # Try to match not CodeGenOnly opcode. + try: + cmd = f"echo -e '{asm_string}' | llvm-mc --show-encoding" + result = subprocess.run(cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + # Try to match CodeGenOnly opcodes. + except subprocess.CalledProcessError: + cmd = f"echo -e '{asm_string}' |" \ + "llvm-mc --show-encoding -debug-only=print-opcode" + result = subprocess.run(cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + + output = result.stdout.decode('utf-8').split('\n') + for line in output: + line = line.strip() + match = re.match('.*# encoding: \[(.*)\]', line) + if match: + encoding = match.group(1).split(',') + break + encoding_str = '' + for byte in encoding: + encoding_str += f'{int(byte, 16):02x}' + return (opcode, encoding_str) + + +def get_xed_info(opcode, encoding, mode): + xed = args.xed or 'xed' + assert shutil.which(xed) is not None, f'{xed} not found' + + result = None + for m in [mode, 64, 32, 16]: + try: + cmd = f'{xed} -{m} -v 4 -d "{encoding}"' + result = subprocess.run(cmd, + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + output = output.split('\n') + operands_info = [] + line_no = 3 + while not output[line_no].startswith('EOSZ:'): + opi, infos = output[line_no].split() + assert int(opi) == len(operands_info) + infos = infos.split('/') + operands_info.append({ + 'Name': infos[0], + 'XType': infos[-2].lower(), + 'Width': int(infos[-1]), + }) + line_no += 1 + + eosz = int(re.match('EOSZ:\s*(.*)', output[line_no]).group(1)) + iclass = re.match('ICLASS:\s*(.*)', output[line_no + 
2]).group(1) + category = re.match('CATEGORY:\s*(.*)', + output[line_no + 3]).group(1) + extension = re.match('EXTENSION:\s*(.*)', + output[line_no + 4]).group(1) + iform = re.match('IFORM:\s*(.*)', output[line_no + 5]).group(1) + isa_set = re.match('ISA_SET:\s*(.*)', output[line_no + 6]).group(1) + return (opcode, { + 'EOSZ': eosz, + 'IClass': iclass, + 'Category': category, + 'Extension': extension, + 'IForm': iform, + 'IsaSet': isa_set, + 'OpdsInfo': operands_info, + }) + + except: + continue + else: + print(f'[{opcode}]error ', cmd, file=sys.stderr) + return (opcode, None) + + +if __name__ == '__main__': + args = parse_command_line() + ostream = sys.stdout if args.o == '-' else open(args.o, 'w') + istream = sys.stdin if args.jf == '-' else open(args.jf, 'r') + instr_sched_info = json.load(istream) + + # Fix asm strings. + task_args = [] + for opcode, info in instr_sched_info.items(): + asm_string = info.get('AsmString', None) + if asm_string is not None and opcode not in invalid_opcode_list: + task_args.append([opcode, asm_string, info['Modes']]) + with Pool() as pool: + result = pool.starmap(fix_asm, task_args) + for opcode, asm in result: + instr_sched_info[opcode]['AsmString'] = asm + + # Encode assembly. + task_args = [] + for opcode, info in instr_sched_info.items(): + asm_string = info.get('AsmString', None) + if asm_string is not None and opcode not in invalid_opcode_list: + task_args.append([opcode, asm_string]) + with Pool() as pool: + result = pool.starmap(encode_asm, task_args) + for opcode, encoding_str in result: + instr_sched_info[opcode]['Encoding'] = encoding_str + + # Add xed info. 
+ task_args = [] + for opcode, info in instr_sched_info.items(): + encoding = info.get('Encoding', None) + if encoding is not None: + asm_string = info['AsmString'] + match = re.match(r'.*\.code(\d{2})', asm_string) + mode = 32 + if match: + mode = int(match.group(1)) + task_args.append((opcode, encoding, mode)) + with Pool() as pool: + result = pool.starmap(get_xed_info, task_args) + for opcode, xed_info in result: + if xed_info: + instr_sched_info[opcode]['XedInfo'] = xed_info + + json.dump(instr_sched_info, ostream, indent=2) + istream.close() + ostream.close() diff --git a/llvm/utils/schedtool/tools/proc_res_group_helper.py b/llvm/utils/schedtool/tools/proc_res_group_helper.py new file mode 100755 --- /dev/null +++ b/llvm/utils/schedtool/tools/proc_res_group_helper.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import argparse, json, sys, os, math + +# Add parent dir to path. +sys.path.append(f'{os.path.dirname(os.path.realpath(__file__))}/..') + +from lib import target, llvm_instr, info_parser + + +def parse_command_line(): + parser = argparse.ArgumentParser( + description='Helper to define ProcResGroup.') + parser.add_argument('--target-cpu', required=True, help='target cpu') + parser.add_argument('--jf', + default='-', + help='instruction sched info json file') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_command_line() + istream = sys.stdin if args.jf == '-' else open(args.jf, 'r') + target_cpu = target.get_target(args.target_cpu) + llvm_instrs = info_parser.parse_llvm_instr_info(json.load(istream), + target_cpu) + ports_set = set() + for llvm_instr in llvm_instrs: + if (llvm_instr.has_uops_info() + and not llvm_instr.is_invalid(target_cpu)): + for uop in llvm_instr.uops_info.uops: + ports_set.add(uop.ports) + ports_groups = sorted(p for p in ports_set if len(p) > 1) + aligned_width = math.ceil( + (max(len(target_cpu.get_ports_name(p)) + for p in ports_groups) + 1) / 2) * 2 + for pg in ports_groups: + pg_name = 
target_cpu.get_ports_name(pg) + res = f'def {pg_name}' + ' ' * (aligned_width - len(pg_name)) + p_names = [target_cpu.get_ports_name((p, )) for p in pg] + print(res + ': ProcResGroup<[' + ', '.join(p_names) + ']>;') + + istream.close() diff --git a/llvm/utils/schedtool/xed-patch/0001-Dump-eosz-and-operand-s-xtype-width-when-verbosity-i.patch b/llvm/utils/schedtool/xed-patch/0001-Dump-eosz-and-operand-s-xtype-width-when-verbosity-i.patch new file mode 100644 --- /dev/null +++ b/llvm/utils/schedtool/xed-patch/0001-Dump-eosz-and-operand-s-xtype-width-when-verbosity-i.patch @@ -0,0 +1,91 @@ +From 74227952c6472226221c1bb138767800c373b9a8 Mon Sep 17 00:00:00 2001 +From: Haohai Wen +Date: Wed, 23 Mar 2022 13:34:18 +0800 +Subject: [PATCH] Dump eosz and operand's xtype,width when verbosity is greater + than 3 + +--- + include/public/xed/xed-inst.h | 2 +- + src/dec/xed-disas.c | 9 ++++++++- + src/dec/xed-inst.c | 9 ++++++++- + 3 files changed, 17 insertions(+), 3 deletions(-) + +diff --git a/include/public/xed/xed-inst.h b/include/public/xed/xed-inst.h +index d0c6e69..9095f73 100644 +--- a/include/public/xed/xed-inst.h ++++ b/include/public/xed/xed-inst.h +@@ -178,7 +178,7 @@ static XED_INLINE xed_uint32_t xed_operand_imm(const xed_operand_t* p) { + /// @param buf buffer that gets filled in + /// @param buflen maximum buffer length + XED_DLL_EXPORT void +-xed_operand_print(const xed_operand_t* p, char* buf, int buflen); ++xed_operand_print(const xed_operand_t* p, char* buf, int buflen, const xed_uint32_t eosz); + //@} + + /// @name xed_inst_t Template Operand Enum Name Classification +diff --git a/src/dec/xed-disas.c b/src/dec/xed-disas.c +index d533390..2bed3d9 100644 +--- a/src/dec/xed-disas.c ++++ b/src/dec/xed-disas.c +@@ -160,16 +160,23 @@ xed_decoded_inst_dump(const xed_decoded_inst_t* p, char* buf, int buflen) + + blen = xed_strncat(buf,"\n",blen); + noperands = xed_inst_noperands(xi); ++ ++ const xed_uint32_t eosz = xed3_operand_get_eosz(p); + for( i=0;i_name),blen); 
+ blen = xed_strncat(buf,"/",blen); +@@ -122,6 +123,12 @@ void xed_operand_print(const xed_operand_t* p, char* buf, int buflen) { + xed_nonterminal_enum_t2str(xed_operand_nt_lookup_fn_enum(p)), + blen); + } ++ char tmp_buf[16]; ++ blen = xed_strncat(buf,"/",blen); ++ blen = xed_strncat(buf,xed_operand_element_xtype_enum_t2str(p->_xtype),blen); ++ blen = xed_strncat(buf,"/",blen); ++ xed_sprintf_uint32(tmp_buf,xed_operand_width_bits(p, eosz),16); ++ blen = xed_strncat(buf,tmp_buf,blen); + } + + unsigned int xed_attribute_max(void) { +-- +2.29.2 +