diff --git a/llvm/utils/check_cost_tables.py b/llvm/utils/check_cost_tables.py
new file mode 100644
--- /dev/null
+++ b/llvm/utils/check_cost_tables.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+
+# Helper script to compare the TTI cost table values for various IR ops and
+# intrinsics against the llvm-mca costs reported from the generated assembly.
+#
+# As cost tables typically use worst case values, the script runs against a set
+# of cpus at a similar level and compares the cost reported by opt --analyze
+# against the highest llvm-mca cost across all those cpus.
+#
+# By default, the script will exhaustively check all cpulevels and all
+# scalar/vector ops up to the max legal vector width (pow2 numelts only), but
+# more specific checks can be made with the --cpulevel and --op command args.
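+#
+# Example invocations (the build paths below are illustrative):
+#   check_cost_tables.py --cpulevel=avx2
+#   check_cost_tables.py --op=fptosi --stop-on-diff --opt-binary=./build/bin/opt --llc-binary=./build/bin/llc --llvm-mca-binary=./build/bin/llvm-mca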
+
+import argparse, sys, re, math, os
+
+def run_analysis(srctype, dsttype, op, opname, cpus, declarations = ""):
+    costs = {}
+    recipthroughputs = {}
+
+    # TODO - stop writing/reading files and just pipe stdout/stdin to the tools
+    # TODO - RecipThroughput only - add Latency/CodeSize/SizeAndLatency support
+
+    # Write out candidate IR
+    f = open("fuzz.ll", "w")
+    print("define {} @costfuzz({} %a0, {} %a1, {} %a2) {{".format(dsttype, srctype, srctype, srctype), file=f)
+    print("tail call void asm sideeffect \"# LLVM-MCA-BEGIN foo\", \"~{dirflag},~{fpsr},~{flags},~{rsp}\"()", file=f)
+    print(op, file=f)
+    print("tail call void asm sideeffect \"# LLVM-MCA-END foo\", \"~{dirflag},~{fpsr},~{flags},~{rsp}\"()", file=f)
+    print("ret {} %result".format(dsttype), file=f)
+    print("}", file=f)
+    print(declarations, file=f)
+    f.close()
+
+    # TODO - is it worth trying to run these in parallel?
+    for cpu in cpus:
+        # Run cost-model analysis
+        costscmd = "{} -analyze -cost-model -mcpu={} -mtriple={} fuzz.ll -S -o analyze.txt".format(args.opt_binary, cpu, args.triple)
+        if os.system(costscmd) != 0:
+            print("Error running opt -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # Run llc
+        llccmd = "{} -mcpu={} -mtriple={} fuzz.ll -o fuzz.s".format(args.llc_binary, cpu, args.triple)
+        if os.system(llccmd) != 0:
+            print("Error running llc -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # TODO - strip out assembly to pass to llvm-mca to avoid need for asm barriers in IR
+
+        # Run llvm-mca
+        mcacmd = "{} -mcpu={} -mtriple={} fuzz.s -o mca.txt".format(args.llvm_mca_binary, cpu, args.triple)
+        if os.system(mcacmd) != 0:
+            print("Error running llvm-mca -mcpu={} : {}".format(cpu, op))
+            sys.exit(1)
+
+        # Extract costs
+        f = open("analyze.txt", "r")
+        for line in f.readlines():
+            if opname in line:
+                matches = re.search(r"Cost Model: Found an estimated cost of (\d+)", line)
+                costs[cpu] = float(matches.group(1))
+                break
+        f.close()
+
+        # Extract mca block rthroughput (math.ceil() rounds up to a worst case cost)
+        f = open("mca.txt", "r")
+        for line in f.readlines():
+            if "Block RThroughput:" in line:
+                matches = re.search(r"Block RThroughput: ([0-9\.]+)", line)
+                recipthroughputs[cpu] = math.ceil(max(1.0, float(matches.group(1))))
+                break
+        f.close()
+
+    mincost = min(costs.values())
+    maxcost = max(costs.values())
+    minrecipthroughput = min(recipthroughputs.values())
+    maxrecipthroughput = max(recipthroughputs.values())
+
+    if maxcost != maxrecipthroughput:
+        print("{} {} {}: cost ({} - {}) vs recipthroughput ({} - {})".format(dsttype, opname, srctype, mincost, maxcost, minrecipthroughput, maxrecipthroughput))
+        for cpu in cpus:
+            print(" {} : {} vs {}".format(cpu, costs[cpu], recipthroughputs[cpu]))
+        if args.stop_on_diff:
+            sys.exit(-1)
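+
+# For reference, a call such as
+#   run_analysis("<4 x i32>", "<4 x i32>", "%result = add <4 x i32> %a0, %a1", "add", cpus)
+# writes out a fuzz.ll along the lines of:
+#   define <4 x i32> @costfuzz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+#   tail call void asm sideeffect "# LLVM-MCA-BEGIN foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()
+#   %result = add <4 x i32> %a0, %a1
+#   tail call void asm sideeffect "# LLVM-MCA-END foo", "~{dirflag},~{fpsr},~{flags},~{rsp}"()
+#   ret <4 x i32> %result
+#   }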
+
+def get_float_string(width):
+    if width == 16:
+        return "half"
+    if width == 32:
+        return "float"
+    if width == 64:
+        return "double"
+    return None
+
+def get_type(elementcount, base):
+    if elementcount == 0:
+        return base
+    return "<{} x {}>".format(elementcount, base)
+
+def get_typestub(elttype, elementcount, base):
+    if elementcount == 0:
+        return "{}{}".format(elttype, base)
+    return "v{}{}{}".format(elementcount, elttype, base)
+
+def get_typeistub(elementcount, base):
+    return get_typestub('i', elementcount, base)
+
+def get_typefstub(elementcount, base):
+    return get_typestub('f', elementcount, base)
+
+# TODO - add half conversion
+def fp_cast(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32, 64 ]:
+            for dstbasewidth in [ 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    srctype = get_type(elementcount, get_float_string(srcbasewidth))
+                    dsttype = get_type(elementcount, get_float_string(dstbasewidth))
+                    cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+
+                    if srcbasewidth < dstbasewidth and op == "fpext":
+                        if dstbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+                    if srcbasewidth > dstbasewidth and op == "fptrunc":
+                        if srcbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def fp_unaryops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, get_float_string(basewidth))
+                    cmd = "%result = {} {} %a0".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+def fp_binops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, get_float_string(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+# TODO - support bool predicate results for some targets
+def fp_cmp(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16 ]:
+                if basewidth * elementcount <= maxwidth:
+                    for cc in [ "oeq", "ogt", "oge", "olt", "ole", "one", "ord", "ueq", "ugt", "uge", "ult", "ule", "une", "uno" ]:
+                        cctype = get_type(elementcount, "i{}".format(1))
+                        srctype = get_type(elementcount, get_float_string(basewidth))
+                        dsttype = get_type(elementcount, "i{}".format(basewidth))
+                        cmd = "%cmp = {} {} {} %a0, %a1\n%result = sext {} %cmp to {}".format(op, cc, srctype, cctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, "{} {}".format(op, cc), cpus)
+
+def int_cast(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 8, 16, 32, 64 ]:
+            for dstbasewidth in [ 8, 16, 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    srctype = get_type(elementcount, "i{}".format(srcbasewidth))
+                    dsttype = get_type(elementcount, "i{}".format(dstbasewidth))
+                    cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+
+                    if srcbasewidth < dstbasewidth and op != "trunc":
+                        if dstbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+                    if srcbasewidth > dstbasewidth and op == "trunc" and elementcount != 0:
+                        if srcbasewidth * elementcount <= maxwidth:
+                            run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def int_binops(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+def int_shifts(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    cmd = "%result = {} {} %a0, %a1".format(op, type)
+                    run_analysis(type, type, cmd, op, cpus)
+
+# TODO - support bool predicate results for some targets
+def int_cmp(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    for cc in [ "eq", "ne", "ugt", "uge", "ult", "ule", "sgt", "sge", "slt", "sle" ]:
+                        cctype = get_type(elementcount, "i{}".format(1))
+                        type = get_type(elementcount, "i{}".format(basewidth))
+                        cmd = "%cmp = {} {} {} %a0, %a1\n%result = sext {} %cmp to {}".format(op, cc, type, cctype, type)
+                        run_analysis(type, type, cmd, "{} {}".format(op, cc), cpus)
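+
+# For reference, the cmp helpers above emit a two instruction sequence so that the
+# i1 predicate is widened back to the full-width result type, e.g. for icmp eq on <4 x i32>:
+#   %cmp = icmp eq <4 x i32> %a0, %a1
+#   %result = sext <4 x i1> %cmp to <4 x i32>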
+
+def int_to_fp(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32 ]:
+            for dstbasewidth in [ 32, 64 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    if (srcbasewidth * elementcount) <= maxwidth or (dstbasewidth * elementcount) <= maxwidth:
+                        srctype = get_type(elementcount, "i{}".format(srcbasewidth))
+                        dsttype = get_type(elementcount, get_float_string(dstbasewidth))
+                        cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def fp_to_int(maxwidth, ops, cpus):
+    for op in ops:
+        for srcbasewidth in [ 32, 64 ]:
+            for dstbasewidth in [ 32 ]:
+                for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                    if (srcbasewidth * elementcount) <= maxwidth or (dstbasewidth * elementcount) <= maxwidth:
+                        srctype = get_type(elementcount, get_float_string(srcbasewidth))
+                        dsttype = get_type(elementcount, "i{}".format(dstbasewidth))
+                        cmd = "%result = {} {} %a0 to {}".format(op, srctype, dsttype)
+                        run_analysis(srctype, dsttype, cmd, op, cpus)
+
+def int_unaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            if op == "bswap" and basewidth == 8:
+                continue
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0)".format(type, op, stub, type)
+                    declaration = "declare {} @llvm.{}.{}({})".format(type, op, stub, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_binaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0, {} %a1)".format(type, op, stub, type, type)
+                    declaration = "declare {} @llvm.{}.{}({}, {})".format(type, op, stub, type, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_ternaryintrinsics(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 0, 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    type = get_type(elementcount, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.{}.{}({} %a0, {} %a1, {} %a2)".format(type, op, stub, type, type, type)
+                    declaration = "declare {} @llvm.{}.{}({}, {}, {})".format(type, op, stub, type, type, type)
+                    run_analysis(type, type, cmd, op, cpus, declaration)
+
+def int_reductions(maxwidth, ops, cpus):
+    for op in ops:
+        for basewidth in [ 8, 16, 32, 64 ]:
+            for elementcount in [ 2, 4, 8, 16, 32, 64 ]:
+                if basewidth * elementcount <= maxwidth:
+                    vectype = get_type(elementcount, "i{}".format(basewidth))
+                    scltype = get_type(0, "i{}".format(basewidth))
+                    stub = get_typeistub(elementcount, basewidth)
+                    cmd = "%result = call {} @llvm.vector.reduce.{}.{}({} %a0)".format(scltype, op, stub, vectype)
+                    declaration = "declare {} @llvm.vector.reduce.{}.{}({})".format(scltype, op, stub, vectype)
+                    run_analysis(vectype, scltype, cmd, "vector.reduce.{}".format(op), cpus, declaration)
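+
+# For reference, int_reductions emits a call plus matching declaration, e.g. for an
+# i32 add reduction over <8 x i32>:
+#   %result = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0)
+#   declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)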
+
+def filter_ops(targetops, ops):
+    if len(targetops) == 0:
+        return ops
+
+    selectops = list()
+    for targetop in targetops:
+        if targetop in ops:
+            selectops.append(targetop)
+    return selectops
+
+def test_cpus(targetops, maxwidth, cpus):
+    ops = filter_ops(targetops, [ "fpext", "fptrunc" ])
+    fp_cast(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fneg" ])
+    fp_unaryops(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fadd", "fsub", "fmul", "fdiv" ])
+    fp_binops(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fcmp" ])
+    fp_cmp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "select" ])
+    # TODO - select with fcmp
+
+    # TODO - fabs, fsqrt, ceil, floor, trunc, rint, nearbyint
+    # fp_unaryintrinsics()
+
+    # TODO - copysign, maxnum, maximum, minnum, minimum
+    # fp_binaryintrinsics()
+
+    # TODO - reduction op filtering
+    #if len(targetops) == 0 or "reduce" in targetops:
+    #    fp_reductions(maxwidth, [ "fadd", "fmul", "fmax", "fmin" ], cpus)
+
+    ops = filter_ops(targetops, [ "sext", "zext", "trunc" ])
+    int_cast(maxwidth, ops, cpus)
+
+    # TODO - sdiv/udiv/srem/urem (+ by constant/pow2 cases)
+    ops = filter_ops(targetops, [ "and", "or", "xor", "add", "sub", "mul" ])
+    int_binops(maxwidth, ops, cpus)
+
+    # TODO - uniform / constant shift amount costs
+    ops = filter_ops(targetops, [ "shl", "lshr", "ashr" ])
+    int_shifts(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "icmp" ])
+    int_cmp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "select" ])
+    # TODO - select with icmp
+
+    # TODO - bitcasts i1/i32/i64/float/double
+
+    # TODO - vector ops (extract/insert/shuffle)
+
+    # TODO - better reduction op filtering
+    if len(targetops) == 0 or "reduce" in targetops:
+        int_reductions(maxwidth, [ "and", "or", "xor", "add", "mul", "smax", "smin", "umax", "umin" ], cpus)
+
+    ops = filter_ops(targetops, [ "sitofp", "uitofp" ])
+    int_to_fp(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "fptosi", "fptoui" ])
+    fp_to_int(maxwidth, ops, cpus)
+
+    ops = filter_ops(targetops, [ "bitreverse", "bswap", "ctpop" ])
+    int_unaryintrinsics(maxwidth, ops, cpus)
+    # TODO - ctlz, cttz
+
+    ops = filter_ops(targetops, [ "smax", "smin", "umax", "umin" ])
+    int_binaryintrinsics(maxwidth, ops, cpus)
+
+    # TODO - uniform / constant shift amount costs
+    ops = filter_ops(targetops, [ "fshl", "fshr" ])
+    int_ternaryintrinsics(maxwidth, ops, cpus)
+
+def main():
+    # TODO - 2 modes - (a) create generic codegen for sse level and compare cpu analysis
+    #                  (b) create generic codegen for each cpu of a similar level and compare cpu analysis
+    # TODO - How should we test sandybridge (default) on other levels? What about other cpus?
+    cpulevels = {
+        "avx512"  : ( 512, [ "skylake-avx512" ] ),
+        "avx512f" : ( 512, [ "knl" ] ),
+        "avx2"    : ( 256, [ "broadwell", "haswell", "skylake", "znver1", "znver2", "znver3" ] ),
+        "avx1"    : ( 256, [ "bdver2", "btver2", "sandybridge" ] ),
+        "sse4.2"  : ( 128, [ "slm" ] ),
+        "sse4.1"  : ( 128, [ "slm" ] ),
+        "ssse3"   : ( 128, [ "atom" ] ),
+        "sse3"    : ( 128, [ "atom" ] ),
+        "sse2"    : ( 128, [ "atom" ] ),
+    }
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--triple',
+                        metavar='',
+                        default='x86_64--',
+                        help='Specify the target triple (default: x86_64--)')
+    parser.add_argument('--cpulevel',
+                        metavar='[ssse3,sse4.2,avx1,avx2,avx512]',
+                        default=None,
+                        help='Only test cpus specific to a cpulevel')
+    # TODO - --op(s) command line handling to select multiple ops for testing
+    parser.add_argument('--op',
+                        metavar='',
+                        default=None,
+                        help='Only test requested op')
+    parser.add_argument('--stop-on-diff',
+                        action='store_true',
+                        help='Stop on the first analysis/mca discrepancy, leaving the fuzz.ll, analyze.txt, fuzz.s and mca.txt temp files in place')
+    parser.add_argument('--opt-binary',
+                        metavar='',
+                        default='opt',
+                        help='The "opt" binary to use to analyze the test case IR (default: opt)')
+    parser.add_argument('--llc-binary',
+                        metavar='',
+                        default='llc',
+                        help='The "llc" binary to use to generate the test case assembly (default: llc)')
+    parser.add_argument('--llvm-mca-binary',
+                        metavar='',
+                        default='llvm-mca',
+                        help='The "llvm-mca" binary to use to analyze the test case assembly (default: llvm-mca)')
+
+    global args
+    args = parser.parse_args()
+
+    targetops = list()
+    if args.op is not None:
+        targetops = [ args.op ]
+
+    targetcpus = [ "avx512", "avx2", "avx1", "sse4.2", "ssse3" ]
+    if args.cpulevel is not None:
+        targetcpus = [ args.cpulevel ]
+
+    for targetcpu in targetcpus:
+        (maxwidth, cpus) = cpulevels[targetcpu]
+        test_cpus(targetops, maxwidth, cpus)
+
+    return 0
+
+if __name__ == '__main__':
+    main()