#!/usr/bin/env python3
"""Calculate average sizes from bloaty CSV or TinyUSB metrics JSON outputs."""

import argparse
import csv
import glob
import io
import json
import os
import sys
from collections import defaultdict


def expand_files(file_patterns):
    """Expand file patterns (globs) to list of files.

    Patterns containing ``*`` or ``?`` are expanded with ``glob.glob`` and
    the matches are returned in sorted order, so the result does not depend
    on directory enumeration order.  Patterns that only contain a ``[...]``
    character class are expanded the same way, unless the pattern names an
    existing path or matches nothing, in which case it is kept literally
    (so a file whose name contains ``[`` still works, and a missing file is
    still reported by the caller).  Plain paths are passed through
    unchanged, whether or not they exist.

    Args:
        file_patterns: List of file paths or glob patterns

    Returns:
        List of expanded file paths, grouped in the order the patterns
        were given
    """
    expanded = []
    for pattern in file_patterns:
        if '*' in pattern or '?' in pattern:
            expanded.extend(sorted(glob.glob(pattern)))
        elif '[' in pattern and not os.path.exists(pattern):
            # Character-class pattern: fall back to the literal text when
            # nothing matches so the caller can warn about the missing file.
            expanded.extend(sorted(glob.glob(pattern)) or [pattern])
        else:
            expanded.append(pattern)
    return expanded


def parse_bloaty_csv(csv_text, filters=None):
    """Parse bloaty CSV text and return normalized JSON data structure.

    Rows are grouped by compile unit, read from the first non-empty of the
    ``compileunits``, ``compileunit`` or ``path`` columns.  Rows without a
    compile unit, ``TOTAL`` rows, rows rejected by *filters*, and rows whose
    ``vmsize`` is missing or not an integer are skipped.

    Args:
        csv_text: Complete CSV document (including the header row) as a string.
        filters: Optional list of substrings; when non-empty, only compile
            units whose path contains at least one of them are kept.

    Returns:
        Dict with ``"files"`` (one entry per compile unit carrying ``file``,
        ``path``, ``size``, ``symbols`` sorted by descending size, and
        ``sections``) and ``"TOTAL"`` (sum of all kept ``vmsize`` values).
    """

    filters = filters or []
    reader = csv.DictReader(io.StringIO(csv_text))
    size_by_unit = defaultdict(int)
    symbols_by_unit: dict[str, defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))
    sections_by_unit: dict[str, defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))

    for row in reader:
        compile_unit = row.get("compileunits") or row.get("compileunit") or row.get("path")
        if compile_unit is None:
            continue

        if str(compile_unit).upper() == "TOTAL":
            continue

        if filters and not any(filt in compile_unit for filt in filters):
            continue

        try:
            vmsize = int(row.get("vmsize", 0))
        except (TypeError, ValueError):
            # ValueError: non-numeric text.  TypeError: DictReader fills the
            # missing trailing columns of a short row with None.
            continue

        size_by_unit[compile_unit] += vmsize
        symbol_name = row.get("symbols", "")
        if symbol_name:
            symbols_by_unit[compile_unit][symbol_name] += vmsize
        section_name = row.get("sections") or row.get("section")
        if section_name and vmsize:
            sections_by_unit[compile_unit][section_name] += vmsize

    files = []
    for unit_path, total_size in size_by_unit.items():
        symbols = [
            {"name": sym, "size": sz}
            for sym, sz in sorted(symbols_by_unit[unit_path].items(), key=lambda x: x[1], reverse=True)
        ]
        # Zero-sized sections carry no information, so leave them out.
        sections = {sec: sz for sec, sz in sections_by_unit[unit_path].items() if sz}
        files.append(
            {
                "file": os.path.basename(unit_path) or unit_path,
                "path": unit_path,
                "size": total_size,
                "symbols": symbols,
                "sections": sections,
            }
        )

    total_all = sum(size_by_unit.values())
    return {"files": files, "TOTAL": total_all}


def combine_files(input_files, filters=None):
    """Load every metrics input and collect the parsed results.

    ``.json`` inputs are read as metrics JSON and ``.csv`` inputs are parsed
    with parse_bloaty_csv().  Missing files, unsupported extensions and
    inputs that fail to load are reported on stderr and skipped.

    Args:
        input_files: Iterable of input file paths.
        filters: Optional list of substrings; when non-empty, only entries
            whose path contains at least one of them are kept.

    Returns:
        Dict with ``"file_list"`` (paths that were loaded) and ``"data"``
        (the parsed structure for each of those paths, in the same order).
    """

    filters = filters or []
    combined = {"file_list": [], "data": []}

    def _matches_filter(entry):
        # An entry without a usable path can never satisfy a filter.
        entry_path = entry.get("path")
        return bool(entry_path) and any(filt in entry_path for filt in filters)

    for source in input_files:
        if not os.path.exists(source):
            print(f"Warning: {source} not found, skipping", file=sys.stderr)
            continue

        try:
            if source.endswith(".json"):
                with open(source, "r", encoding="utf-8") as handle:
                    metrics = json.load(handle)
                if filters:
                    metrics["files"] = [
                        entry for entry in metrics.get("files", []) if _matches_filter(entry)
                    ]
            elif source.endswith(".csv"):
                with open(source, "r", encoding="utf-8") as handle:
                    raw_csv = handle.read()
                metrics = parse_bloaty_csv(raw_csv, filters)
            elif source.endswith(".elf"):
                print(f"Warning: {source} is an ELF; please run bloaty with --csv output first. Skipping.",
                      file=sys.stderr)
                continue
            else:
                print(f"Warning: {source} is not a supported CSV or JSON metrics input. Skipping.",
                      file=sys.stderr)
                continue

            # A "TOTAL" pseudo-entry is a summary row, not a real file.
            real_entries = []
            for entry in metrics.get("files", []):
                if str(entry.get("file", "")).upper() != "TOTAL":
                    real_entries.append(entry)
            metrics["files"] = real_entries

            combined["file_list"].append(source)
            combined["data"].append(metrics)
        except Exception as e:  # pragma: no cover - defensive
            print(f"Warning: Failed to analyze {source}: {e}", file=sys.stderr)
            continue

    return combined


def compute_avg(all_json_data):
    """Average per-file, per-symbol and per-section sizes across all inputs.

    Entries sharing the same ``file`` value are merged; each averaged value
    is the rounded mean over the inputs in which that item appears.  The
    ``path`` reported for a file is the one from its first occurrence.

    Args:
        all_json_data: Dictionary with file_list and data from combine_files()

    Returns:
        Dictionary with ``file_list`` and averaged ``files`` (each carrying
        ``percent`` of the overall total, and per-symbol ``percent`` of the
        file size), or None when there is no input data.
    """
    datasets = all_json_data["data"]
    if not datasets:
        return None

    def _rounded_mean(values):
        return round(sum(values) / len(values))

    # file name -> collected samples for its size, symbols and sections
    merged = {}
    for dataset in datasets:
        for entry in dataset.get("files", []):
            key = entry["file"]
            bucket = merged.get(key)
            if bucket is None:
                bucket = {
                    "sizes": [],
                    "path": entry.get("path"),
                    "symbols": defaultdict(list),
                    "sections": defaultdict(list),
                }
                merged[key] = bucket
            bucket["sizes"].append(entry.get("size", 0))
            for symbol in entry.get("symbols", []):
                symbol_name = symbol.get("name")
                if symbol_name is not None:
                    bucket["symbols"][symbol_name].append(symbol.get("size", 0))
            for section_name, section_size in (entry.get("sections") or {}).items():
                bucket["sections"][section_name].append(section_size)

    averaged = []
    for key, bucket in merged.items():
        mean_size = _rounded_mean(bucket["sizes"]) if bucket["sizes"] else 0
        symbol_rows = [
            {"name": symbol_name, "size": _rounded_mean(samples)}
            for symbol_name, samples in bucket["symbols"].items()
            if samples
        ]
        symbol_rows.sort(key=lambda row: row["size"], reverse=True)
        section_means = {}
        for section_name, samples in bucket["sections"].items():
            if samples:
                section_means[section_name] = _rounded_mean(samples)
        averaged.append(
            {
                "file": key,
                "path": bucket["path"],
                "size": mean_size,
                "symbols": symbol_rows,
                "sections": section_means,
            }
        )

    # Fall back to 1 so an all-zero data set does not divide by zero.
    grand_total = sum(row["size"] for row in averaged) or 1

    for row in averaged:
        row["percent"] = (row["size"] / grand_total) * 100
        for symbol in row["symbols"]:
            symbol["percent"] = (symbol["size"] / row["size"]) * 100 if row["size"] else 0

    return {
        "file_list": all_json_data["file_list"],
        "files": averaged,
    }


def compare_files(base_file, new_file, filters=None):
    """Compare two CSV or JSON inputs and generate difference report.

    Each input is loaded with combine_files() and normalized with
    compute_avg(); files are then matched by their ``file`` name.  A file,
    symbol or section present on only one side counts as size 0 on the other.

    Args:
        base_file: Path of the baseline CSV/metrics JSON input.
        new_file: Path of the CSV/metrics JSON input to compare against it.
        filters: Optional list of path substrings forwarded to combine_files().

    Returns:
        Dict with ``base_file``, ``new_file``, ``total`` (base/new/diff) and
        ``files`` (sorted by file name; each with ``size``, ``symbols`` and
        ``sections`` base/new/diff data), or None when either input yields
        no data.
    """
    filters = filters or []

    base_avg = compute_avg(combine_files([base_file], filters))
    new_avg = compute_avg(combine_files([new_file], filters))

    if not base_avg or not new_avg:
        return None

    base_files = {f["file"]: f for f in base_avg["files"]}
    new_files = {f["file"]: f for f in new_avg["files"]}
    all_file_names = set(base_files.keys()) | set(new_files.keys())

    comparison_files = []
    for fname in sorted(all_file_names):
        b = base_files.get(fname, {})
        n = new_files.get(fname, {})
        b_size = b.get("size", 0)
        n_size = n.get("size", 0)
        base_sections = b.get("sections") or {}
        new_sections = n.get("sections") or {}

        # Symbol diffs
        b_syms = {s["name"]: s for s in b.get("symbols", [])}
        n_syms = {s["name"]: s for s in n.get("symbols", [])}
        all_syms = set(b_syms.keys()) | set(n_syms.keys())
        symbols = []
        for sym in all_syms:
            sb = b_syms.get(sym, {}).get("size", 0)
            sn = n_syms.get(sym, {}).get("size", 0)
            symbols.append({"name": sym, "base": sb, "new": sn, "diff": sn - sb})
        # Largest change first.  The name breaks ties so the order does not
        # depend on set iteration order, which varies with the hash seed.
        symbols.sort(key=lambda x: (-abs(x["diff"]), str(x["name"])))

        comparison_files.append({
            "file": fname,
            "size": {"base": b_size, "new": n_size, "diff": n_size - b_size},
            "symbols": symbols,
            "sections": {
                name: {
                    "base": base_sections.get(name, 0),
                    "new": new_sections.get(name, 0),
                    "diff": new_sections.get(name, 0) - base_sections.get(name, 0),
                }
                for name in sorted(set(base_sections) | set(new_sections))
            },
        })

    base_total = sum(f["size"] for f in base_avg["files"])
    new_total = sum(f["size"] for f in new_avg["files"])
    total = {
        "base": base_total,
        "new": new_total,
        "diff": new_total - base_total,
    }

    return {
        "base_file": base_file,
        "new_file": new_file,
        "total": total,
        "files": comparison_files,
    }


def get_sort_key(sort_order):
    """Get sort key function based on sort order.

    Args:
        sort_order: One of 'size', 'size-', 'size+', 'name', 'name-',
            'name+'.  The bare forms follow the CLI help text: 'size' is
            the same as 'size-' (descending) and 'name' the same as 'name+'
            (ascending).  Any other value falls back to 'name+'.

    Returns:
        Tuple of (key_func, reverse)
    """

    def _size_val(entry):
        return entry.get('size', 0)

    def _name_val(entry):
        return entry.get('file', '')

    if sort_order in ('size', 'size-'):
        return _size_val, True
    if sort_order == 'size+':
        return _size_val, False
    if sort_order == 'name-':
        return _name_val, True
    # 'name', 'name+' and anything unrecognized
    return _name_val, False


def format_diff(base, new, diff):
    """Describe a size change as text.

    Returns just the new value when nothing changed, ``base ➙ new`` when
    either side is zero, and otherwise ``base ➙ new`` followed by the signed
    delta and its percentage of *base*.
    """
    if diff == 0:
        return str(new)

    transition = f"{base} ➙ {new}"
    if base == 0 or new == 0:
        # A percentage is meaningless when one side is zero.
        return transition

    percent = (diff / base) * 100
    # Negative numbers already print their own minus sign.
    prefix = "+" if diff > 0 else ""
    return f"{transition} ({prefix}{diff}, {prefix}{percent:.1f}%)"


def write_json_output(json_data, path):
    """Serialize *json_data* to *path* as UTF-8 JSON with two-space indentation.

    Args:
        json_data: JSON-serializable object to write.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(json_data, fp=sink, indent=2)


def render_combine_table(json_data, sort_order='name+'):
    """Render averaged sizes as markdown table lines (no title).

    Args:
        json_data: Dict with a ``"files"`` list.  Each entry may carry
            ``file``, ``size``, ``percent`` and ``sections`` (name -> size).
            A truthy ``"TOTAL"`` value overrides the summed file sizes.
        sort_order: Sort order passed to get_sort_key().

    Returns:
        List of markdown lines: header, separator, one row per file and a
        final TOTAL row, or ``["No entries."]`` when there are no files.
    """
    files = json_data.get("files", [])
    if not files:
        return ["No entries."]

    key_func, reverse = get_sort_key(sort_order)
    files_sorted = sorted(files, key=key_func, reverse=reverse)

    total_size = json_data.get("TOTAL") or sum(f.get("size", 0) for f in files_sorted)

    def _pct_text(entry):
        # Prefer the precomputed percentage; otherwise derive it from size.
        pct = entry.get("percent")
        if pct is None:
            pct = entry.get("size", 0) / total_size * 100 if total_size else 0
        return f"{pct:.1f}%"

    pct_strings = [_pct_text(f) for f in files_sorted]
    pct_width = 6
    size_width = max(len("size"), *(len(str(f.get("size", 0))) for f in files_sorted), len(str(total_size)))
    file_width = max(len("File"), *(len(f.get("file", "")) for f in files_sorted), len("TOTAL"))

    # Build section totals on the fly from file data
    sections_global = defaultdict(int)
    for f in files_sorted:
        for name, size in (f.get("sections") or {}).items():
            sections_global[name] += size
    # Display sections in reverse alphabetical order for stable column layout
    section_names = sorted(sections_global.keys(), reverse=True)
    section_widths = {}
    for name in section_names:
        widest_cell = max(
            (len(str((f.get("sections") or {}).get(name, 0))) for f in files_sorted),
            default=0,
        )
        # The TOTAL row shows the column sum, which can be wider than any
        # single file's value, so it must take part in the width too.
        section_widths[name] = max(len(name), widest_cell, len(str(sections_global[name])), 1)

    if not section_names:
        header = f"| {'File':<{file_width}} | {'size':>{size_width}} | {'%':>{pct_width}} |"
        separator = f"| :{'-' * (file_width - 1)} | {'-' * (size_width - 1)}: | {'-' * (pct_width - 1)}: |"
    else:
        header_parts = [f"| {'File':<{file_width}} |"]
        sep_parts = [f"| :{'-' * (file_width - 1)} |"]
        for name in section_names:
            header_parts.append(f" {name:>{section_widths[name]}} |")
            sep_parts.append(f" {'-' * (section_widths[name] - 1)}: |")
        header_parts.append(f" {'size':>{size_width}} | {'%':>{pct_width}} |")
        sep_parts.append(f" {'-' * (size_width - 1)}: | {'-' * (pct_width - 1)}: |")
        header = "".join(header_parts)
        separator = "".join(sep_parts)

    lines = [header, separator]

    for f, pct_str in zip(files_sorted, pct_strings):
        size_val = f.get("size", 0)
        parts = [f"| {f.get('file', ''):<{file_width}} |"]
        if section_names:
            sections_map = f.get("sections") or {}
            for name in section_names:
                parts.append(f" {sections_map.get(name, 0):>{section_widths[name]}} |")
        parts.append(f" {size_val:>{size_width}} | {pct_str:>{pct_width}} |")
        lines.append("".join(parts))

    total_parts = [f"| {'TOTAL':<{file_width}} |"]
    if section_names:
        for name in section_names:
            total_parts.append(f" {sections_global.get(name, 0):>{section_widths[name]}} |")
    total_parts.append(f" {total_size:>{size_width}} | {'100.0%':>{pct_width}} |")
    lines.append("".join(total_parts))
    return lines


def write_combine_markdown(json_data, path, sort_order='name+', title="TinyUSB Average Code Size Metrics"):
    """Write the averaged size table as a markdown document.

    The document is a title heading, the table from render_combine_table()
    and, when ``json_data`` has a non-empty ``file_list``, a collapsible
    list of the input files.

    Args:
        json_data: Averaged metrics as produced by compute_avg().
        path: Destination markdown file path.
        sort_order: Sort order forwarded to render_combine_table().
        title: Text of the top-level heading.
    """

    table_lines = render_combine_table(json_data, sort_order)
    document = [f"# {title}", "", *table_lines, ""]

    inputs = json_data.get("file_list")
    if inputs:
        document += ["<details>", "<summary>Input files</summary>", ""]
        document += [f"- {mf}" for mf in inputs]
        document += ["", "</details>", ""]

    with open(path, "w", encoding="utf-8") as out:
        out.write("\n".join(document))


def write_compare_markdown(comparison, path, sort_order='size'):
    """Write the comparison report as a markdown document.

    Files are grouped into three tables: changes above 1%, changes below
    1%, and unchanged files (the last one inside a collapsed block).

    Args:
        comparison: Comparison dict as returned by compare_files().
        path: Destination markdown file path.
        sort_order: Sort order applied within each group.
    """
    significant, minor, unchanged = _split_by_significance(comparison["files"], sort_order)

    def section(title, rows, collapsed=False):
        # Build the lines for one group: heading (or <details> wrapper) + table.
        table = render_compare_table(_build_rows(rows, sort_order), include_sum=True)
        if collapsed:
            return [f"<details><summary>{title}</summary>", "", *table, "", "</details>", ""]
        return [f"## {title}", *table, ""]

    document = [
        "# Size Difference Report",
        "",
        "Because TinyUSB code size varies by port and configuration, the metrics below represent the averaged totals across all example builds.",
        "",
        "Note: If there is no change, only one value is shown.",
        "",
    ]
    document += section("Changes >1% in size", significant)
    document += section("Changes <1% in size", minor)
    document += section("No changes", unchanged, collapsed=True)

    with open(path, "w", encoding="utf-8") as out:
        out.write("\n".join(document))


def print_compare_summary(comparison, sort_order='name+'):
    """Print the comparison as a single table (with TOTAL row) on stdout.

    Args:
        comparison: Comparison dict as returned by compare_files().
        sort_order: Sort order applied to the file rows.
    """

    table_rows = _build_rows(comparison["files"], sort_order)
    for text in render_compare_table(table_rows, include_sum=True):
        print(text)


def _build_rows(files, sort_order):
    """Sort files and prepare printable fields.

    Args:
        files: Comparison entries, each with ``file``, ``size`` (dict with
            ``base``, ``new`` and ``diff``) and optionally ``sections``.
        sort_order: 'size'/'size-' sort by absolute size change, largest
            first; 'size+' smallest first; 'name-' by file name descending;
            anything else by file name ascending.

    Returns:
        List of row dicts with ``file``, ``base``, ``new``, ``diff``,
        ``pct`` (signed percentage text, or "n/a" when base is 0) and
        ``sections``.
    """

    by_size = sort_order in ('size', 'size-', 'size+')
    # Bare 'size' means descending, as documented in the CLI help.
    reverse = sort_order in ('size', 'size-', 'name-')

    def sort_key(file_row):
        if by_size:
            return abs(file_row["size"]["diff"])
        return file_row['file']

    files_sorted = sorted(files, key=sort_key, reverse=reverse)

    rows = []
    for f in files_sorted:
        sd = f["size"]
        diff_val = sd['new'] - sd['base']
        if sd['base'] == 0:
            # No baseline to relate the change to.
            pct_str = "n/a"
        else:
            pct_val = (diff_val / sd['base']) * 100
            pct_str = f"{pct_val:+.1f}%"
        rows.append({
            "file": f['file'],
            "base": sd['base'],
            "new": sd['new'],
            "diff": diff_val,
            "pct": pct_str,
            "sections": f.get("sections", {}),
        })
    return rows


def _split_by_significance(files, sort_order):
    """Split files into >1% changes, <1% changes, and no changes.

    Args:
        files: Comparison entries, each with ``file`` and ``size`` (dict
            with ``base`` and ``diff``).
        sort_order: Any value starting with 'size' sorts by absolute size
            change ('size'/'size-' largest first, 'size+' smallest first);
            other values sort by file name ('name-' descending).

    Returns:
        Tuple ``(significant, minor, unchanged)`` of lists, each in sorted
        order.  A change is significant when it exceeds 1% of the base
        size, or when the base size is 0 and the size changed at all.
    """

    def is_significant(file_row):
        base = file_row["size"]["base"]
        diff = abs(file_row["size"]["diff"])
        if base == 0:
            return diff != 0
        return (diff / base) * 100 > 1.0

    rows_sorted = sorted(
        files,
        key=lambda f: abs(f["size"]["diff"]) if sort_order.startswith("size") else f["file"],
        # Bare 'size' means descending, as documented in the CLI help.
        reverse=sort_order in ('size', 'size-', 'name-'),
    )

    significant = []
    minor = []
    unchanged = []
    for f in rows_sorted:
        if f["size"]["diff"] == 0:
            unchanged.append(f)
        else:
            (significant if is_significant(f) else minor).append(f)

    return significant, minor, unchanged


def render_compare_table(rows, include_sum):
    """Return markdown table lines for given rows.

    Args:
        rows: Row dicts with ``file``, ``base``, ``new``, ``pct`` and
            optionally ``sections`` (name -> dict with ``base``/``new``).
        include_sum: When true, append a TOTAL row summing all rows.

    Returns:
        List of markdown lines (header, separator, one line per row and
        optionally the TOTAL row), or ``["No entries.", ""]`` when *rows*
        is empty.
    """
    if not rows:
        return ["No entries.", ""]

    # collect section columns (reverse alpha)
    section_names = sorted(
        {name for r in rows for name in (r.get("sections") or {})},
        reverse=True,
    )

    def fmt_abs(val_old, val_new):
        # Unchanged values print once; changed ones as "old ➙ new (±diff)".
        diff = val_new - val_old
        if diff == 0:
            return f"{val_new}"
        sign = "+" if diff > 0 else ""
        return f"{val_old} ➙ {val_new} ({sign}{diff})"

    def section_cell(row, name):
        entry = (row.get("sections") or {}).get(name) or {}
        return fmt_abs(entry.get("base", 0), entry.get("new", 0))

    def section_total(name, field):
        return sum(((r.get("sections") or {}).get(name) or {}).get(field, 0) for r in rows)

    sum_base = sum(r["base"] for r in rows)
    sum_new = sum(r["new"] for r in rows)
    total_diff = sum_new - sum_base
    total_pct = "n/a" if sum_base == 0 else f"{(total_diff / sum_base) * 100:+.1f}%"

    # Pre-format the TOTAL row's section cells so they count toward the
    # column widths; a column sum can be wider than any single row's cell.
    total_cells = {}
    if include_sum:
        for name in section_names:
            total_cells[name] = fmt_abs(section_total(name, "base"), section_total(name, "new"))

    file_width = max(len("file"), *(len(r["file"]) for r in rows), len("TOTAL"))
    size_width = max(
        len("size"),
        *(len(fmt_abs(r["base"], r["new"])) for r in rows),
        len(fmt_abs(sum_base, sum_new)),
    )
    pct_width = max(len("% diff"), *(len(r["pct"]) for r in rows), len(total_pct))
    section_widths = {}
    for name in section_names:
        widest_cell = max(len(section_cell(r, name)) for r in rows)
        section_widths[name] = max(len(name), widest_cell, len(total_cells.get(name, "")), 1)

    header_parts = [f"| {'file':<{file_width}} |"]
    sep_parts = [f"| :{'-' * (file_width - 1)} |"]
    for name in section_names:
        header_parts.append(f" {name:>{section_widths[name]}} |")
        sep_parts.append(f" {'-' * (section_widths[name] - 1)}: |")
    header_parts.append(f" {'size':>{size_width}} | {'% diff':>{pct_width}} |")
    sep_parts.append(f" {'-' * (size_width - 1)}: | {'-' * (pct_width - 1)}: |")
    header = "".join(header_parts)
    separator = "".join(sep_parts)

    lines = [header, separator]

    for r in rows:
        parts = [f"| {r['file']:<{file_width}} |"]
        for name in section_names:
            parts.append(f" {section_cell(r, name):>{section_widths[name]}} |")
        parts.append(f" {fmt_abs(r['base'], r['new']):>{size_width}} | {r['pct']:>{pct_width}} |")
        lines.append("".join(parts))

    if include_sum:
        total_parts = [f"| {'TOTAL':<{file_width}} |"]
        for name in section_names:
            total_parts.append(f" {total_cells[name]:>{section_widths[name]}} |")
        total_parts.append(f" {fmt_abs(sum_base, sum_new):>{size_width}} | {total_pct:>{pct_width}} |")
        lines.append("".join(total_parts))
    return lines


def cmd_combine(args):
    """Run the ``combine`` subcommand.

    Expands the input patterns, averages the loaded metrics, prints the
    table unless ``--quiet`` is set, and writes the JSON and/or markdown
    outputs that were requested.  Exits with status 1 when no input could
    be loaded.
    """
    loaded = combine_files(expand_files(args.files), args.filters)
    averaged = compute_avg(loaded)

    if averaged is None:
        print("No valid map files found", file=sys.stderr)
        sys.exit(1)

    if not args.quiet:
        for text in render_combine_table(averaged, sort_order=args.sort):
            print(text)

    if args.json_out:
        write_json_output(averaged, f"{args.out}.json")

    if args.markdown_out:
        write_combine_markdown(
            averaged,
            f"{args.out}.md",
            sort_order=args.sort,
            title="TinyUSB Average Code Size Metrics",
        )


def cmd_compare(args):
    """Run the ``compare`` subcommand.

    Compares the base and new inputs, prints the summary table unless
    ``--quiet`` is set, and writes the markdown report when requested.
    Exits with status 1 when the comparison could not be produced.
    """
    report = compare_files(args.base, args.new, args.filters)
    if report is None:
        print("Failed to compare files", file=sys.stderr)
        sys.exit(1)

    verbose = not args.quiet
    if verbose:
        print_compare_summary(report, args.sort)

    if not args.markdown_out:
        return

    write_compare_markdown(report, f"{args.out}.md", args.sort)
    if verbose:
        print(f"Comparison written to {args.out}.md")


def main(argv=None):
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.
    """
    sort_choices = ['size', 'size-', 'size+', 'name', 'name-', 'name+']
    filter_help = 'Only include compile units whose path contains this substring (can be repeated)'

    parser = argparse.ArgumentParser(description='Code size metrics tool')
    subparsers = parser.add_subparsers(dest='command', required=True, help='Available commands')

    # "combine": average several inputs into one table
    combine = subparsers.add_parser(
        'combine',
        help='Combine and average bloaty CSV outputs or metrics JSON files',
    )
    combine.add_argument(
        'files', nargs='+',
        help='Path to bloaty CSV output or TinyUSB metrics JSON file(s) (including linkermap-generated) or glob pattern(s)',
    )
    combine.add_argument('-f', '--filter', dest='filters', action='append', default=[], help=filter_help)
    combine.add_argument(
        '-o', '--out', dest='out', default='metrics',
        help='Output path basename for JSON and Markdown files (default: metrics)',
    )
    combine.add_argument('-j', '--json', dest='json_out', action='store_true', help='Write JSON output file')
    combine.add_argument('-m', '--markdown', dest='markdown_out', action='store_true',
                         help='Write Markdown output file')
    combine.add_argument('-q', '--quiet', dest='quiet', action='store_true', help='Suppress summary output')
    combine.add_argument(
        '-S', '--sort', dest='sort', default='size-', choices=sort_choices,
        help='Sort order: size/size- (descending), size+ (ascending), name/name+ (ascending), name- (descending). Default: size-',
    )

    # "compare": diff a base input against a new one
    compare = subparsers.add_parser(
        'compare',
        help='Compare two metrics inputs (bloaty CSV or metrics JSON)',
    )
    compare.add_argument('base', help='Base CSV/metrics JSON file')
    compare.add_argument('new', help='New CSV/metrics JSON file')
    compare.add_argument('-f', '--filter', dest='filters', action='append', default=[], help=filter_help)
    compare.add_argument(
        '-o', '--out', dest='out', default='metrics_compare',
        help='Output path basename for Markdown/JSON files (default: metrics_compare)',
    )
    compare.add_argument('-m', '--markdown', dest='markdown_out', action='store_true',
                         help='Write Markdown output file')
    compare.add_argument(
        '-S', '--sort', dest='sort', default='name+', choices=sort_choices,
        help='Sort order: size/size- (descending), size+ (ascending), name/name+ (ascending), name- (descending). Default: name+',
    )
    compare.add_argument('-q', '--quiet', dest='quiet', action='store_true',
                         help='Suppress stdout summary output')

    args = parser.parse_args(argv)

    handlers = {'combine': cmd_combine, 'compare': cmd_compare}
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()