diff --git a/llvm/utils/check_cost_tables.py b/llvm/utils/check_cost_tables.py
new file mode 100644
--- /dev/null
+++ b/llvm/utils/check_cost_tables.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+
+# Helper script to compare the TTI cost table values for various IR ops and
+# intrinsics against the llvm-mca costs reported from the generated assembly.
+#
+# As cost tables typically use worst case values, the script runs against a set
+# of cpus at a similar level and compares the cost reported by opt --analyze
+# against the highest llvm-mca cost across all those cpus.
+#
+# By default, the script will exhaustively check all cpulevels and all
+# scalar/vector ops up to the max legal vector width (pow2 numelts only), but
+# more specific checks can be made with the --cpulevel and --op command args.
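+#
+# Example invocations (the build paths below are illustrative):
+#   check_cost_tables.py --cpulevel=avx2
+#   check_cost_tables.py --op=fptosi --stop-on-diff --opt-binary=./build/bin/opt --llc-binary=./build/bin/llc --llvm-mca-binary=./build/bin/llvm-mca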
+
+import argparse, sys, re, math, os
+
+def run_analysis(srctype, dsttype, op, opname, cpus, declarations = ""):
+    costs = {}
+    recipthroughputs = {}
+
+    # TODO - stop writing/reading files and just pipe stdout/stdin to the tools
+    # TODO - RecipThroughput only - add Latency/CodeSize/SizeAndLatency support
+
+    # Write out candidate IR
+    f = open("fuzz.ll", "w")
+    print("define {} @costfuzz({} %a0, {} %a1, {} %a2) {{".format(dsttype, srctype, srctype, srctype), file=f)
+    print("tail call void asm sideeffect \"# LLVM-MCA-BEGIN foo\", \"~{dirflag},~{fpsr},~{flags},~{rsp}\"()", file=f)
+    print(op, file=f)
+    print("tail call void asm sideeffect \"# LLVM-MCA-END foo\", \"~{dirflag},~{fpsr},~{flags},~{rsp}\"()", file=f)
+    print("ret {} %result".format(dsttype), file=f)
+    print("}", file=f)
+    print(declarations, file=f)
+    f.close()
+
+    # TODO - is it worth trying to run these in parallel?
+    for cpu in cpus:
+        # Run cost-model analysis
+        costscmd = "{} -analyze -cost-model -mcpu={} -mtriple={} fuzz.ll -S -o analyze.txt".format(args.opt_binary, cpu, args.triple)
+        if os.system(costscmd) != 0:
+            print("Error running opt -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # Run llc
+        llccmd = "{} -mcpu={} -mtriple={} fuzz.ll -o fuzz.s".format(args.llc_binary, cpu, args.triple)
+        if os.system(llccmd) != 0:
+            print("Error running llc -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # TODO - strip out assembly to pass to llvm-mca to avoid need for asm barriers in IR
+
+        # Run llvm-mca
+        mcacmd = "{} -mcpu={} -mtriple={} fuzz.s -o mca.txt".format(args.llvm_mca_binary, cpu, args.triple)
+        if os.system(mcacmd) != 0:
+            print("Error running llvm-mca -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # Extract costs
+        f = open("analyze.txt", "r")
+        for line in f.readlines():
+            if opname in line:
+                matches = re.search(r"Cost Model: Found an estimated cost of (\d+)", line)
+                costs[cpu] = float(matches.group(1))
+                break
+        f.close()
+
+        # Extract mca block rthroughput (math.ceil() rounds up to a worst case cost)
+        f = open("mca.txt", "r")
+        for line in f.readlines():
+            if "Block RThroughput:" in line:
+                matches = re.search(r"Block RThroughput: ([0-9\.]+)", line)
+                recipthroughputs[cpu] = math.ceil(max(1.0, float(matches.group(1))))
+                break
+        f.close()
+
+    mincost = min(costs.values())
+    maxcost = max(costs.values())
+    minrecipthroughput = min(recipthroughputs.values())
+    maxrecipthroughput = max(recipthroughputs.values())
+
+    if maxcost != maxrecipthroughput:
+        print("{} {} {}: cost ({} - {}) vs recipthroughput ({} - {})".format(dsttype, opname, srctype, mincost, maxcost, minrecipthroughput, maxrecipthroughput))
+        for cpu in cpus:
+            print(" {} : {} vs {}".format(cpu, costs[cpu], recipthroughputs[cpu]))
+        if args.stop_on_diff:
+            sys.exit(-1)
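+
+# For reference, a call such as
+#   run_analysis("<4 x i32>", "<4 x i32>", "%result = add <4 x i32> %a0, %a1", "add", cpus)
+# writes out a fuzz.ll along the lines of:
+#   define <4 x i32> @costfuzz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+#   tail call void asm sideeffect "# LLVM-MCA-BEGIN foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()
+#   %result = add <4 x i32> %a0, %a1
+#   tail call void asm sideeffect "# LLVM-MCA-END foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()
+#   ret <4 x i32> %result
+#   }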
+
+def get_float_string(width):
+    if width == 16:
+        return "half"
+    if width == 32:
+        return "float"
+    if width == 64:
+        return "double"
+    return None
+
+def get_type(elementcount, base):
+    if elementcount == 0:
+        return base
+    return "<{} x {}>".format(elementcount, base)
+
+def get_typestub(elttype, elementcount, base):
+    if elementcount == 0:
+        return "{}{}".format(elttype, base)
+    return "v{}{}{}".format(elementcount, elttype, base)
+
+def get_typeistub(elementcount, base):
+    return get_typestub('i', elementcount, base)
+
+def get_typefstub(elementcount, base):
+    return get_typestub('f', elementcount, base)
+
+# TODO - add half conversion
+def fp_cast(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32, 64 ]:
+            for dstbasewidth in [ 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    srctype = get_type(elementcount, get_float_string(srcbasewidth))
+                    dsttype = get_type(elementcount, get_float_string(dstbasewidth))
+                    cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+
+                    if srcbasewidth < dstbasewidth and op == "fpext":
+                        if dstbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+                    if srcbasewidth > dstbasewidth and op == "fptrunc":
+                        if srcbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def fp_unaryops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, get_float_string(basewidth))
+                    cmd = "%result = {} {} %a0".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+def fp_binops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, get_float_string(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+# TODO - support bool predicate results for some targets
+def fp_cmp(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    for cc in [ "oeq", "ogt", "oge", "olt", "ole", "one", "ord", "ueq", "ugt", "uge", "ult", "ule", "une", "uno" ]:
+                        cctype = get_type(elementcount, "i{}".format(1))
+                        srctype = get_type(elementcount, get_float_string(basewidth))
+                        dsttype = get_type(elementcount, "i{}".format(basewidth))
+                        cmd = "%cmp = {} {} {} %a0, %a1\n%result = sext {} %cmp to {}".format(op, cc, srctype, cctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, "{} {}".format(op, cc), cpus)
+
+def int_cast(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 8, 16, 32, 64 ]:
+            for dstbasewidth in [ 8, 16, 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    srctype = get_type(elementcount, "i{}".format(srcbasewidth))
+                    dsttype = get_type(elementcount, "i{}".format(dstbasewidth))
+                    cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+
+                    if srcbasewidth < dstbasewidth and op != "trunc":
+                        if dstbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+                    if srcbasewidth > dstbasewidth and op == "trunc" and elementcount != 0:
+                        if srcbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def int_binops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+def int_shifts(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+# TODO - support bool predicate results for some targets
+def int_cmp(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    for cc in [ "eq", "ne", "ugt", "uge", "ult", "ule", "sgt", "sge", "slt", "sle" ]:
+                        cctype = get_type(elementcount, "i{}".format(1))
+                        type = get_type(elementcount, "i{}".format(basewidth))
+                        cmd = "%cmp = {} {} {} %a0, %a1\n%result = sext {} %cmp to {}".format(op, cc, type, cctype, type)
+                        run_analysis(type, type, cmd, "{} {}".format(op, cc), cpus)
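+
+# For reference, the cmp helpers above emit a two instruction sequence so that the
+# i1 predicate is widened back to the full-width result type, e.g. for icmp eq on <4 x i32>:
+#   %cmp = icmp eq <4 x i32> %a0, %a1
+#   %result = sext <4 x i1> %cmp to <4 x i32>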
+
+def int_to_fp(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32 ]:
+            for dstbasewidth in [ 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    if (srcbasewidth * elementcount) <= maxwidth or (dstbasewidth * elementcount) <= maxwidth:
+                        srctype = get_type(elementcount, "i{}".format(srcbasewidth))
+                        dsttype = get_type(elementcount, get_float_string(dstbasewidth))
+                        cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def fp_to_int(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32, 64 ]:
+            for dstbasewidth in [ 32 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    if (srcbasewidth * elementcount) <= maxwidth or (dstbasewidth * elementcount) <= maxwidth:
+                        srctype = get_type(elementcount, get_float_string(srcbasewidth))
+                        dsttype = get_type(elementcount, "i{}".format(dstbasewidth))
+                        cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def int_unaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            if op == "bswap" and basewidth == 8:
+                continue
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0)".format(type, op, stub, type)
+                    declaration = "declare {} @llvm.{}.{}({})".format(type, op, stub, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_binaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0, {} %a1)".format(type, op, stub, type, type)
+                    declaration = "declare {} @llvm.{}.{}({}, {})".format(type, op, stub, type, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_ternaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0, {} %a1, {} %a2)".format(type, op, stub, type, type, type)
+                    declaration = "declare {} @llvm.{}.{}({}, {}, {})".format(type, op, stub, type, type, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_reductions(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    vectype = get_type(elementcount, "i{}".format(basewidth))
+                    scltype = get_type(0, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.vector.reduce.{}.{}({} %a0)".format(scltype, op, stub, vectype)
+                    declaration = "declare {} @llvm.vector.reduce.{}.{}({})".format(scltype, op, stub, vectype)
+                    run_analysis(vectype, scltype, cmd, "vector.reduce.{}".format(op), cpus, declaration)
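+
+# For reference, int_reductions emits a call plus matching declaration, e.g. for an
+# i32 add reduction over <8 x i32>:
+#   %result = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0)
+#   declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)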
+
+def filter_ops(targetops, ops):
+    if len(targetops) == 0:
+        return ops
+
+    selectops = list()
+    for targetop in targetops:
+        if targetop in ops:
+            selectops.append(targetop)
+    return selectops
+
+def test_cpus(targetops, maxwidth, cpus):
+    ops = filter_ops(targetops, [ "fpext", "fptrunc" ])
+    fp_cast(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fneg" ])
+    fp_unaryops(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fadd", "fsub", "fmul", "fdiv" ])
+    fp_binops(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fcmp" ])
+    fp_cmp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "select" ])
+    # TODO - select with fcmp
+
+    # TODO - fabs, fsqrt, ceil, floor, trunc, rint, nearbyint
+    # fp_unaryintrinsics()
+
+    # TODO - copysign, maxnum, maximum, minnum, minimum
+    # fp_binaryintrinsics()
+
+    # TODO - reduction op filtering
+    #if len(targetops) == 0 or "reduce" in targetops:
+    #    fp_reductions(maxwidth, [ "fadd", "fmul", "fmax", "fmin" ], cpus)
+
+    ops = filter_ops(targetops, [ "sext", "zext", "trunc" ])
+    int_cast(maxwidth, ops, cpus)
+
+    # TODO - sdiv/udiv/srem/urem (+ by constant/pow2 cases)
+    ops = filter_ops(targetops, [ "and", "or", "xor", "add", "sub", "mul" ])
+    int_binops(maxwidth, ops, cpus)
+
+    # TODO - uniform / constant shift amount costs
+    ops = filter_ops(targetops, [ "shl", "lshr", "ashr" ])
+    int_shifts(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "icmp" ])
+    int_cmp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "select" ])
+    # TODO - select with icmp
+
+    # TODO - bitcasts i1/i32/i64/float/double
+
+    # TODO - vector ops (extract/insert/shuffle)
+
+    # TODO - better reduction op filtering
+    if len(targetops) == 0 or "reduce" in targetops:
+        int_reductions(maxwidth, [ "and", "or", "xor", "add", "mul", "smax", "smin", "umax", "umin" ], cpus)
+
+    ops = filter_ops(targetops, [ "sitofp", "uitofp" ])
+    int_to_fp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fptosi", "fptoui" ])
+    fp_to_int(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "bitreverse", "bswap", "ctpop" ])
+    int_unaryintrinsics(maxwidth, ops, cpus)
+    # TODO - ctlz, cttz
+
+    ops = filter_ops(targetops, [ "smax", "smin", "umax", "umin" ])
+    int_binaryintrinsics(maxwidth, ops, cpus)
+
+    # TODO - uniform / constant shift amount costs
+    ops = filter_ops(targetops, [ "fshl", "fshr" ])
+    int_ternaryintrinsics(maxwidth, ops, cpus)
+
+def main():
+    # TODO - 2 modes - (a) create generic codegen for sse level and compare cpu analysis
+    #                  (b) create generic codegen for each cpu of a similar level and compare cpu analysis
+    # TODO - How should we test sandybridge (default) on other levels? What about other cpus?
+    cpulevels = {
+        "avx512"  : ( 512, [ "skylake-avx512" ] ),
+        "avx512f" : ( 512, [ "knl" ] ),
+        "avx2"    : ( 256, [ "broadwell", "haswell", "skylake", "znver1", "znver2", "znver3" ] ),
+        "avx1"    : ( 256, [ "bdver2", "btver2", "sandybridge" ] ),
+        "sse4.2"  : ( 128, [ "slm" ] ),
+        "sse4.1"  : ( 128, [ "slm" ] ),
+        "ssse3"   : ( 128, [ "atom" ] ),
+        "sse3"    : ( 128, [ "atom" ] ),
+        "sse2"    : ( 128, [ "atom" ] ),
+    }
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--triple',
+                        metavar='',
+                        default='x86_64--',
+                        help='Specify the target triple (default: x86_64--)')
+    parser.add_argument('--cpulevel',
+                        metavar='[ssse3,sse4.2,avx1,avx2,avx512]',
+                        default=None,
+                        help='Only test cpus specific to a cpulevel')
+    # TODO - --op(s) command line handling to select multiple ops for testing
+    parser.add_argument('--op',
+                        metavar='',
+                        default=None,
+                        help='Only test requested op')
+    parser.add_argument('--stop-on-diff',
+                        action='store_true',
+                        help='Stop on the first analysis/mca discrepancy, leaving the fuzz.ll, analyze.txt, fuzz.s and mca.txt temp files in place')
+    parser.add_argument('--opt-binary',
+                        metavar='',
+                        default='opt',
+                        help='The "opt" binary to use to analyze the test case IR (default: opt)')
+    parser.add_argument('--llc-binary',
+                        metavar='',
+                        default='llc',
+                        help='The "llc" binary to use to generate the test case assembly (default: llc)')
+    parser.add_argument('--llvm-mca-binary',
+                        metavar='',
+                        default='llvm-mca',
+                        help='The "llvm-mca" binary to use to analyze the test case assembly (default: llvm-mca)')
+
+    global args
+    args = parser.parse_args()
+
+    targetops = list()
+    if args.op is not None:
+        targetops = [ args.op ]
+
+    targetcpus = [ "avx512", "avx2", "avx1", "sse4.2", "ssse3" ]
+    if args.cpulevel is not None:
+        targetcpus = [ args.cpulevel ]
+
+    for targetcpu in targetcpus:
+        (maxwidth, cpus) = cpulevels[targetcpu]
+        test_cpus(targetops, maxwidth, cpus)
+
+    return 0
+
+if __name__ == '__main__':
+    main()