diff --git a/llvm/utils/check_cost_tables.py b/llvm/utils/check_cost_tables.py new file mode 100644 --- /dev/null +++ b/llvm/utils/check_cost_tables.py @@ -0,0 +1,565 @@ +#!/usr/bin/env python3 + +# Helper script to compare the TTI cost table values for various IR ops and +# intrinsics against the llvm-mca costs reported from the generated assembly. +# +# As cost tables typically use worst case values, the script runs against a set +# of cpus in a similar level and checks the cost reported by opt --analyze vs +# the highest cost across all those cpus. +# +# By default, the script will exhaustively check all cpulevels and all +# scalar/vector ops up to the max legal vector width (pow2 numelts only), but +# more specific checks can be made with the --cpulevel and --op command args. + +import argparse +import math +import re +import os +import subprocess +import concurrent.futures + +from collections import defaultdict + +class Error(Exception): + """Simple exception type for erroring without a traceback.""" + + +def _run_command(cmd, *, input, op): + try: + return subprocess.run(cmd, input=input, text=True, capture_output=True) + except subprocess.CalledProcessError as exc: + raise Error(f"Error running {cmd} : {op}") from exc + + +def _run_analysis(op, opname, ir, cpu, costkind): + # Run opt to get cost-model report + analysis = _run_command( + [ + args.opt_binary, + "-passes=print", + "-disable-output", + f"-cost-kind={costkind}", + f"-mcpu={cpu}", + f"-mtriple={args.triple}" + ], + input=ir, + op=op, + ) + + # Extract analyze costs + for line in analysis.stderr.splitlines(): + if opname in line: + matches = re.search( + r"Cost Model: Found an estimated cost of (\d+)", line + ) + return float(matches.group(1)) + + return None + + +def _run_codegen(op, ir, cpu): + # Run llc to generate asm + llc = _run_command( + [ + args.llc_binary, + f"-mcpu={cpu}", + f"-mtriple={args.triple}" + ], + input=ir, + op=op, + ) + + # TODO - strip out assembly to pass to llvm-mca to avoid need for asm barriers in IR + + # Run llvm-mca to determine asm statistics + mca = _run_command( + [ + args.llvm_mca_binary, + f"-mcpu={cpu}", + f"-mtriple={args.triple}" + ], + input=llc.stdout, + op=op, + ) + + # Extract mca statistics (worst case cost to use math.ceil() to round up) + costs = {} + + for line in mca.stdout.splitlines(): + if "Instructions:" in line: + matches = re.search(r"Instructions: ([0-9]+)", line) + costs["code-size"] = round(math.ceil(max(float(1), float(matches.group(1)))) / float(100)) + continue + if "Total Cycles:" in line: + matches = re.search(r"Total Cycles: ([0-9]+)", line) + costs["latency"] = round(math.ceil(max(float(1), float(matches.group(1)))) / float(100)) + continue + if "Total uOps:" in line: + matches = re.search(r"Total uOps: ([0-9]+)", line) + costs["size-latency"] = round(math.ceil(max(float(1), float(matches.group(1)))) / float(100)) + continue + if "Block RThroughput:" in line: + matches = re.search(r"Block RThroughput: ([0-9\.]+)", line) + costs["throughput"] = math.ceil(max(float(1), float(matches.group(1)))) + break # Assumes other lines are above rthroughput + + return costs + + +def run_analysis(srctype, dsttype, op, opname, cpus, declarations=""): + costkinds = [ "throughput", "latency", "code-size", "size-latency"]; + + analysis_costs = defaultdict(dict) + mca_costs = defaultdict(dict) + + # Write out candidate IR + ir = "\n".join( + [ + f"define {dsttype} @costfuzz({srctype} %a0, {srctype} %a1, {srctype} %a2) {{", + 'tail call void asm sideeffect "# LLVM-MCA-BEGIN foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()', + op, + 'tail call void asm sideeffect "# LLVM-MCA-END foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()', + f"ret {dsttype} %result", + "}", + declarations, + ] + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_threads) as e: + analysis_results = defaultdict(dict) + mca_results = {} + + for cpu in cpus: + mca_results[cpu] = e.submit(_run_codegen, op, ir, cpu) + for costkind in costkinds: + analysis_results[costkind][cpu] = e.submit(_run_analysis, op, opname, ir, cpu, costkind) + + for cpu in cpus: + for costkind in costkinds: + analysis_costs[costkind][cpu] = analysis_results[costkind][cpu].result() + + for cpu in cpus: + costs = mca_results[cpu].result() + for costkind in costkinds: + mca_costs[costkind][cpu] = costs[costkind] + + for costkind in costkinds: + minanalysis = min(analysis_costs[costkind].values()) + maxanalysis = max(analysis_costs[costkind].values()) + minmca = min(mca_costs[costkind].values()) + maxmca = max(mca_costs[costkind].values()) + + if maxmca != maxanalysis: + #if abs(maxmca - maxanalysis) > 1: + print( + f"{dsttype} {opname} {srctype}: cost ({minanalysis} - {maxanalysis}) vs {costkind} ({minmca} - {maxmca})" + ) + for cpu in cpus: + print(f" {cpu} : {analysis_costs[costkind][cpu]} vs {mca_costs[costkind][cpu]}") + if args.stop_on_diff: + with open("fuzz.ll", "w") as f: + f.write(ir) + raise SystemExit(-1) + + +def get_float_string(width): + if width == 16: + return "half" + if width == 32: + return "float" + if width == 64: + return "double" + return None + + +def get_type(elementcount, base): + if elementcount == 0: + return base + return f"<{elementcount} x {base}>" + + +def get_typestub(elttype, elementcount, base): + if elementcount == 0: + return f"{elttype}{base}" + return f"v{elementcount}{elttype}{base}" + + +def get_typeistub(elementcount, base): + return get_typestub("i", elementcount, base) + + +def get_typefstub(elementcount, base): + return get_typestub("f", elementcount, base) + + +# TODO - add half conversion +def fp_cast(maxwidth, ops, cpus): + for op in ops: + for srcbasewidth in [32, 64]: + for dstbasewidth in [32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + srctype = get_type(elementcount, get_float_string(srcbasewidth)) + dsttype = get_type(elementcount, get_float_string(dstbasewidth)) + cmd = f"%result = {op} {srctype} %a0 to {dsttype}" + + if srcbasewidth < dstbasewidth and op == "fpext": + if dstbasewidth * elementcount <= maxwidth: + run_analysis(srctype, dsttype, cmd, op, cpus) + + if srcbasewidth > dstbasewidth and op == "fptrunc": + if srcbasewidth * elementcount <= maxwidth: + run_analysis(srctype, dsttype, cmd, op, cpus) + + +def fp_unaryops(maxwidth, ops, cpus): + for op in ops: + for basewidth in [32, 64]: + for elementcount in [0, 2, 4, 8, 16]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, get_float_string(basewidth)) + cmd = f"%result = {op} {type} %a0" + run_analysis(type, type, cmd, op, cpus) + + +def fp_binops(maxwidth, ops, cpus): + for op in ops: + for basewidth in [32, 64]: + for elementcount in [0, 2, 4, 8, 16]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, get_float_string(basewidth)) + cmd = f"%result = {op} {type} %a0, %a1" + run_analysis(type, type, cmd, op, cpus) + + +# TODO - support bool predicate results for some targets +def fp_cmp(maxwidth, ops, cpus): + for op in ops: + for basewidth in [32, 64]: + for elementcount in [2, 4, 8, 16]: + if (basewidth * elementcount) <= maxwidth: + for cc in [ "oeq", "ogt", "oge", "olt", "ole", "one", "ord", "ueq", "ugt", "uge", "ult", "ule", "une", "uno" ]: + cctype = get_type(elementcount, f"i{1}") + srctype = get_type(elementcount, get_float_string(basewidth)) + dsttype = get_type(elementcount, f"i{basewidth}") + cmd = "\n".join( + [ + f"%cmp = {op} {cc} {srctype} %a0, %a1", + f"%result = sext {cctype} %cmp to {dsttype}", + ] + ) + run_analysis(srctype, dsttype, cmd, f"{op} {cc}", cpus) + + +def int_cast(maxwidth, ops, cpus): + for op in ops: + for srcbasewidth in [8, 16, 32, 64]: + for dstbasewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + srctype = get_type(elementcount, f"i{srcbasewidth}") + dsttype = get_type(elementcount, f"i{dstbasewidth}") + cmd = f"%result = {op} {srctype} %a0 to {dsttype}" + + if srcbasewidth < dstbasewidth and op != "trunc": + if dstbasewidth * elementcount <= maxwidth: + run_analysis(srctype, dsttype, cmd, op, cpus) + + if srcbasewidth > dstbasewidth and op == "trunc": + if srcbasewidth * elementcount <= maxwidth: + if elementcount != 0: + run_analysis(srctype, dsttype, cmd, op, cpus) + + +def int_binops(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, f"i{basewidth}") + cmd = f"%result = {op} {type} %a0, %a1" + run_analysis(type, type, cmd, f" {op} ", cpus) + + +def int_shifts(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, f"i{basewidth}") + cmd = f"%result = {op} {type} %a0, %a1" + run_analysis(type, type, cmd, op, cpus) + + +# TODO - support bool predicate results for some targets +def int_cmp(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + for cc in [ "eq", "ne", "ugt", "uge", "ult", "ule", "sgt", "sge", "slt", "sle" ]: + cctype = get_type(elementcount, f"i{1}") + type = get_type(elementcount, f"i{basewidth}") + cmd = "\n".join( + [ + f"%cmp = {op} {cc} {type} %a0, %a1", + f"%result = sext {cctype} %cmp to {type}", + ] + ) + run_analysis(type, type, cmd, f"{op} {cc}", cpus) + + +def int_to_fp(maxwidth, ops, cpus): + for op in ops: + for srcbasewidth in [8, 16, 32, 64]: + for dstbasewidth in [32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (min(srcbasewidth, dstbasewidth) * elementcount) <= maxwidth: + srctype = get_type(elementcount, f"i{srcbasewidth}") + dsttype = get_type(elementcount, get_float_string(dstbasewidth)) + cmd = f"%result = {op} {srctype} %a0 to {dsttype}" + run_analysis(srctype, dsttype, cmd, op, cpus) + + +def fp_to_int(maxwidth, ops, cpus): + for op in ops: + for srcbasewidth in [32, 64]: + for dstbasewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (min(srcbasewidth, dstbasewidth) * elementcount) <= maxwidth: + srctype = get_type(elementcount, get_float_string(srcbasewidth)) + dsttype = get_type(elementcount, f"i{dstbasewidth}") + cmd = f"%result = {op} {srctype} %a0 to {dsttype}" + run_analysis(srctype, dsttype, cmd, op, cpus) + + +def int_unaryintrinsics(maxwidth, ops, cpus, boolarg = None): + for op in ops: + for basewidth in [8, 16, 32, 64]: + if op == "bswap" and basewidth == 8: + continue + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, f"i{basewidth}") + stub = get_typeistub(elementcount, basewidth) + if boolarg is not None: + boolval = -1 if boolarg else 0 + cmd = f"%result = call {type} @llvm.{op}.{stub}({type} %a0, i1 {boolval})" + declaration = f"declare {type} @llvm.{op}.{stub}({type}, i1)" + else: + cmd = f"%result = call {type} @llvm.{op}.{stub}({type} %a0)" + declaration = f"declare {type} @llvm.{op}.{stub}({type})" + run_analysis(type, type, cmd, op, cpus, declaration) + + +def int_binaryintrinsics(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, f"i{basewidth}") + stub = get_typeistub(elementcount, basewidth) + cmd = f"%result = call {type} @llvm.{op}.{stub}({type} %a0, {type} %a1)" + declaration = f"declare {type} @llvm.{op}.{stub}({type}, {type})" + run_analysis(type, type, cmd, op, cpus, declaration) + + +def int_ternaryintrinsics(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [0, 2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + type = get_type(elementcount, f"i{basewidth}") + stub = get_typeistub(elementcount, basewidth) + cmd = f"%result = call {type} @llvm.{op}.{stub}({type} %a0, {type} %a1, {type} %a2)" + declaration = f"declare {type} @llvm.{op}.{stub}({type}, {type}, {type})" + run_analysis(type, type, cmd, op, cpus, declaration) + + +def int_reductions(maxwidth, ops, cpus): + for op in ops: + for basewidth in [8, 16, 32, 64]: + for elementcount in [2, 4, 8, 16, 32, 64]: + if (basewidth * elementcount) <= maxwidth: + vectype = get_type(elementcount, f"i{basewidth}") + scltype = get_type(0, f"i{basewidth}") + stub = get_typeistub(elementcount, basewidth) + cmd = f"%result = call {scltype} @llvm.vector.reduce.{op}.{stub}({vectype} %a0)" + declaration = f"declare {scltype} @llvm.vector.reduce.{op}.{stub}({vectype})" + run_analysis(vectype, scltype, cmd, f"vector.reduce.{op}", cpus, declaration) + + +def filter_ops(targetops, ops): + if len(targetops) == 0: + return ops + + selectops = list() + for targetop in targetops: + if ops.count(targetop): + selectops.append(targetop) + return selectops + + +def test_cpus(targetops, maxwidth, cpus): + ops = filter_ops(targetops, ["fpext", "fptrunc"]) + fp_cast(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["fneg"]) + fp_unaryops(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["fadd", "fsub", "fmul", "fdiv"]) + fp_binops(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["fcmp"]) + fp_cmp(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["select"]) + # TODO - select with fcmp + + # TODO - fabs, fsqrt, ceil, floor, trunc, rint, nearbyint + # fp_unaryintrinsics() + + # TODO - copysign, maxnum, maxinum, minnum, mininum + # fp_binaryintrinsics() + + # TODO - reduction op filtering + # if len(targetops) == 0 or "reduce" in targetops: + # fp_reductions(maxwidth, [ "fadd", "fmul", "fmax", "fmin" ], cpus) + + ops = filter_ops(targetops, ["sext", "zext", "trunc"]) + int_cast(maxwidth, ops, cpus) + + # TODO - sdiv/udiv/srem/urem (+ by constant/pow2 cases) + ops = filter_ops(targetops, ["and", "or", "xor", "add", "sub", "mul"]) + int_binops(maxwidth, ops, cpus) + + # TODO - uniform / constant shift amount costs + ops = filter_ops(targetops, ["shl", "lshr", "ashr"]) + int_shifts(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["icmp"]) + int_cmp(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["select"]) + # TODO - select with icmp + + # TODO - bitcasts i1/i32/i64/float/double + + # TODO - vector ops (extract/insert/shuffle) + + # TODO - better reduction op filtering + if len(targetops) == 0 or "reduce" in targetops: + int_reductions( + maxwidth, + ["and", "or", "xor", "add", "mul", "smax", "smin", "umax", "umin"], + cpus, + ) + + ops = filter_ops(targetops, ["sitofp", "uitofp"]) + int_to_fp(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["fptosi", "fptoui"]) + fp_to_int(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["bitreverse", "bswap", "ctpop"]) + int_unaryintrinsics(maxwidth, ops, cpus) + + ops = filter_ops(targetops, ["ctlz", "cttz"]) + int_unaryintrinsics(maxwidth, ops, cpus, False) + int_unaryintrinsics(maxwidth, ops, cpus, True) + + ops = filter_ops(targetops, ["smax", "smin", "umax", "umin"]) + int_binaryintrinsics(maxwidth, ops, cpus) + + # TODO - uniform / constant shift amount costs + ops = filter_ops(targetops, ["fshl", "fshr"]) + int_ternaryintrinsics(maxwidth, ops, cpus) + + +def main(): + default_num_threads = os.cpu_count() + + # TODO - 2 modes - (a) create generic codegen for sse level and compare cpu analysis + # (b) create generic codegen for each cpu of a similar level and compare cpu analysis + cpulevels = { + "avx512" : (512, ["x86-64-v4", "skylake-avx512", "icelake-server"]), + "avx512f" : (512, ["knl"]), + "avx2" : (256, ["x86-64-v3", "broadwell", "haswell", "skylake", "alderlake", "znver1", "znver2", "znver3"]), + "avx1" : (256, ["bdver2", "btver2", "sandybridge"]), + "sse4.2" : (128, ["x86-64-v2", "silvermont", "goldmont", "nehalem"]), + "sse4.1" : (128, ["penryn", "core2"]), + "ssse3" : (128, ["atom"]), + "sse3" : (128, ["atom"]), + "sse2" : (128, ["x86-64"]), + } + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--triple", + metavar="", + default="x86_64--", + help="Specify the target triple (default: x86_64--)", + ) + parser.add_argument( + "--cpulevel", + choices=cpulevels.keys(), + default=None, + help="Only test cpus specific to a cpulevel", + ) + # TODO - --op(s) command line handling to select multiple ops for testing + parser.add_argument( + "--op", metavar="", default=None, help="Only test requested op" + ) + parser.add_argument( + "--stop-on-diff", + action="store_true", + help="Stop on first analysis/mca discrepancy, leaves fuzz.ll temp file", + ) + parser.add_argument( + "--opt-binary", + metavar="", + default="opt", + help='The "opt" binary to use to analyze the test case IR (default: opt)', + ) + parser.add_argument( + "--llc-binary", + metavar="", + default="llc", + help='The "llc" binary to use to generate the test case assembly (default: llc)', + ) + parser.add_argument( + "--llvm-mca-binary", + metavar="", + default="llvm-mca", + help='The "llvm-mca "binary to use to analyze the test case assembly (default: llvm-mca)', + ) + parser.add_argument( + "-j", + "--num-threads", + type=int, + default=default_num_threads, + help=f"default:{default_num_threads}", + ) + + global args + args = parser.parse_args() + + targetops = list() + if args.op is not None: + targetops = args.op.split(",") + + targetcpus = ["avx512", "avx2", "avx1", "sse4.2", "ssse3", "sse2"] + if args.cpulevel is not None: + targetcpus = [args.cpulevel] + + for targetcpu in targetcpus: + (maxwidth, cpus) = cpulevels[targetcpu] + test_cpus(targetops, maxwidth, cpus) + + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except Error as error: + print(f"error: {error}") + raise SystemExit(1) from error