scripts: Attempted reimp of stack.py using dwarf variable tags

Problem: I misunderstood the purpose of .debug_frame (objdump --dwarf=frames).

The purpose of .debug_frame is not to record the size of function stack frames, but only to tell a debugger how to access the previous function's stack frame. It just so happens that this _coincidentally_ tells you the stack frame size when compiling with -fomit-frame-pointer.

With -fno-omit-frame-pointer (common on some archs), .debug_frame just says "hey here's the frame pointer" (DW_CFA_def_cfa_register), which tells us nothing about the function's actual stack usage.

So unfortunately .debug_frame does not provide enough info on its own...

---

This commit was an attempt to find the actual stack usage by looking at the relevant variable info (DW_TAG_variable, etc) in the function's dwarf info, but this approach is also not looking very good...

1. The numbers do not appear correct:

     before, -fcallgraph-info=su:          2720
     after, --dwarf=info:                  3558
     after, --dwarf=info --no-shrinkwrap:  3922

   (this is with -fno-omit-frame-pointer)

   In hindsight, this approach is fundamentally flawed. While the variable tags do give us a lower bound on stack usage, they don't tell us about implicit compiler variables and the various stack pushes/pops that happen as a part of expression evaluation. As far as I can tell there's simply not enough info in dwarf info to find an accurate upper bound on stack usage.

2. This approach is quite a bit more complicated, since we need:

   1. Dwarf info (--dwarf=info) to find variable tags.
   2. Location info (--dwarf=loc) to map var allocations to address ranges.
   3. Range info (--dwarf=Ranges) to map lexical blocks to address ranges when var allocation is implicit (not implemented).
   4. And we still need frame info (--dwarf=frames)! since var allocations are frame-relative.

3. Also dwarf info is not guaranteed to contain the whole callgraph. It seems callgraph info is actually _omitted_ with -O0?? I guess this is because the callgraph info is a side-effect of some compiler pass?

   This seems a bit backwards.

   Dwarf does have a flag (DW_AT_call_all_calls) to indicate when callgraph info is complete, but it doesn't seem to be set reliably? Even with optimizations, lfsr_bd_sync, _and only lfsr_bd_sync_, is missing the DW_AT_call_all_calls flag. I have no idea why. The flag is still present in lfsr_bd_erase, lfsr_bd_read, and other functions with function pointers...

So I think this will probably be reverted.
2025-12-01 12:20:02 +00:00 · 2024-12-08 13:27:15 -06:00
parent 14e7501e5c
commit 0e658b8246
1 changed files with 332 additions and 30 deletions
--- a/scripts/stack.py
+++ b/scripts/stack.py
@ -21,6 +21,7 @@ import math as mt
 import os
 import re
 import subprocess as sp
+import bisect


 OBJDUMP_PATH = ['objdump']
@ -577,7 +578,7 @@ def collect_dwarf_info(obj_path, tags=None, *,

    return DwarfInfo(info)

-class Frame(co.namedtuple('Sym', ['addr', 'frame'])):
+class Frame(co.namedtuple('Frame', ['addr', 'frame'])):
    __slots__ = ()
    def __new__(cls, addr, frame):
        return super().__new__(cls, addr, frame)
@ -751,19 +752,244 @@ def collect_dwarf_frames(obj_path, tags=None, *,
            elif op in {
                    'DW_CFA_nop',
                    'DW_CFA_offset',
-                    'DW_CFA_restore'}:
+                    'DW_CFA_restore',
+                    'DW_CFA_def_cfa_register'}:
                pass
            else:
                assert False, "Unknown frame op? %r" % op

    return FrameInfo(frames)

+class Loc(co.namedtuple('Loc', ['addr', 'size', 'ops'])):
+    """A single location entry: the ops in effect over an address range.
+
+    addr is the first address covered, size is how many addresses are
+    covered, and ops holds the location op strings for that range.
+    """
+    __slots__ = ()
+    def __new__(cls, addr, size, ops):
+        return super().__new__(cls, addr=addr, size=size, ops=ops)
+
+    def __repr__(self):
+        # addr and size are shown in hex, ops as-is
+        name = type(self).__name__
+        return '%s(0x%x, 0x%x, %r)' % (name, self.addr, self.size, self.ops)
+
+class LocList:
+    """A location list: the locs that share one list offset.
+
+    Lookup by address (get, [], in) returns the loc whose
+    [addr, addr+size) range contains the address. len() and iteration
+    operate on the locs exactly as given, duplicates included.
+    """
+    def __init__(self, off, locs):
+        self.off = off
+        self.locs = locs
+
+    def get(self, k, d=None):
+        """Return the loc covering address k, or d if there is none."""
+        # organize by address, note this index is built on the first
+        # lookup and then reused, later changes to self.locs are not
+        # picked up
+        if not hasattr(self, '_by_addr'):
+            # sort and keep largest/first when duplicates
+            locs = sorted(self.locs, key=lambda x: (x.addr, -x.size))
+
+            by_addr = []
+            for loc in locs:
+                if not by_addr or by_addr[-1].addr != loc.addr:
+                    by_addr.append(loc)
+            self._by_addr = by_addr
+
+        # find loc by range, bisect lands just past the last loc that
+        # starts at or before k
+        i = bisect.bisect(self._by_addr, k,
+                key=lambda x: x.addr)
+        # check that we're actually in this loc's size
+        if i > 0:
+            loc = self._by_addr[i-1]
+            if k < loc.addr+loc.size:
+                return loc
+        return d
+
+    def __getitem__(self, k):
+        """Like get, but raise KeyError if no loc covers k."""
+        v = self.get(k)
+        if v is None:
+            raise KeyError(k)
+        return v
+
+    def __contains__(self, k):
+        return self.get(k) is not None
+
+    def __len__(self):
+        return len(self.locs)
+
+    def __iter__(self):
+        return iter(self.locs)
+
+def collect_dwarf_locs(obj_path, tags=None, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    """Collect the location lists objdump --dwarf=loc reports for obj_path.
+
+    Returns an OrderedDict mapping each list's offset to a LocList of the
+    Locs in that list. tags is accepted but not used here, and of the
+    extra keyword args only 'verbose' is consulted, which prints the
+    objdump command before running it.
+
+    Raises subprocess.CalledProcessError if objdump exits non-zero.
+    """
+    # raw strings here, \s and \( are not valid escapes in a normal
+    # string literal and newer Pythons warn about them
+    loc_pattern = re.compile(
+            r'^\s*(?P<begin_off>[0-9a-fA-F]+)'
+                    r'\s+(?P<begin_start>v?[0-9a-fA-F]+)'
+                    r'\s+(?P<begin_stop>v?[0-9a-fA-F]+)'
+                    r'\s+views.*$'
+                '|' r'^\s*(?P<end_off>[0-9a-fA-F]+)'
+                    r'\s+<End of list>\s*$'
+                '|' r'^\s*(?P<loc_start>[0-9a-fA-F]+)'
+                    r'\s+(?P<loc_stop>[0-9a-fA-F]+)'
+                    r'\s+\((?P<loc_ops>.+)\)\s*$',
+            re.IGNORECASE)
+
+    # collect location lists
+    locs = co.OrderedDict()
+    list_off = None
+    list_locs = None
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=loc', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    # the with makes sure the pipe is closed and objdump is reaped even
+    # if we bail out of the loop early
+    with sp.Popen(cmd,
+            stdout=sp.PIPE,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False) as proc:
+        for line in proc.stdout:
+            # find location lists
+            m = loc_pattern.match(line)
+            if not m:
+                continue
+
+            # start of list?
+            if m.group('begin_off'):
+                # these occur between every entry, so ignore after
+                # the first one
+                if list_off is None:
+                    list_off = int(m.group('begin_off'), 16)
+                    list_locs = []
+            # end of list?
+            elif m.group('end_off'):
+                assert list_off is not None
+                locs[list_off] = LocList(list_off, list_locs)
+                list_off = None
+                list_locs = None
+            # found a loc?
+            elif m.group('loc_start'):
+                assert list_off is not None
+                start = int(m.group('loc_start'), 16)
+                stop = int(m.group('loc_stop'), 16)
+                # ops are separated by semicolons inside the parens
+                ops = [op.strip() for op in m.group('loc_ops').split(';')]
+                # store as addr+size, not start+stop
+                list_locs.append(Loc(start, stop-start, ops))
+            else:
+                assert False
+    if proc.returncode != 0:
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    return locs
+
+# we basically need a small linker here
+class Func(co.namedtuple('Func', ['file', 'sym', 'entry',
+        'frames', 'calls'])):
+    """A function's file, symbol, and dwarf entry, plus its frames and
+    calls.
+
+    frames defaults to a fresh FuncFrameInfo and calls to a fresh
+    OrderedDict, so every Func gets its own mutable containers.
+    """
+    __slots__ = ()
+    def __new__(cls, file, sym, entry, frames=None, calls=None):
+        # build the defaults per-instance, never share them
+        if frames is None:
+            frames = FuncFrameInfo()
+        if calls is None:
+            calls = co.OrderedDict()
+        return super().__new__(cls, file, sym, entry, frames, calls)
+
+    def __repr__(self):
+        # only the symbol name, the other fields get big
+        return '<%s %s>' % (type(self).__name__, self.sym.name)
+
+class FuncFrame(co.namedtuple('FuncFrame', ['addr', 'size', 'frame'])):
+    """A frame value that applies to the addresses [addr, addr+size)."""
+    __slots__ = ()
+    def __new__(cls, addr, size, frame):
+        return super().__new__(cls, addr=addr, size=size, frame=frame)
+
+    def __repr__(self):
+        # addr and size are shown in hex, frame in decimal
+        name = type(self).__name__
+        return '%s(0x%x, 0x%x, %d)' % (
+                name, self.addr, self.size, self.frame)
+
+class FuncFrameInfo:
+    """The stack frames of a single function, organized by address.
+
+    frames is a list of FuncFrames, kept in address order by set.
+    Assigning a frame to an address range merges it with whatever is
+    already there, with the larger frame winning where ranges overlap.
+    """
+    def __init__(self, frames=None):
+        self.frames = frames if frames is not None else []
+
+    def get(self, k, d=None):
+        """Return the FuncFrame covering address k, or d if there is none."""
+        # find frame by address
+        i = bisect.bisect(self.frames, k,
+                key=lambda x: x.addr)
+        # check that we're actually in this frame's size
+        if i > 0 and k < self.frames[i-1].addr+self.frames[i-1].size:
+            return self.frames[i-1]
+        else:
+            return d
+
+    def set(self, k, v):
+        """Record frame v over k, merging with any overlapping frames.
+
+        k is either a single address or a slice of addresses, only the
+        slice's start and stop are used. Where the new range overlaps
+        existing frames the larger frame is kept, splitting ranges as
+        needed.
+        """
+        # always operate on ranges, a bare address covers only itself
+        if not isinstance(k, slice):
+            k = slice(k, k+1)
+
+        # insert frame, merging frames to find max frames
+        #
+        # we walk the old frames in address order with the new frame
+        # spliced in at its address, f is the incoming frame and g is the
+        # last frame we've emitted so far
+        frames_ = []
+        for f in it.chain(
+                (f for f in self.frames if f.addr < k.start),
+                [FuncFrame(k.start, k.stop-k.start, v)],
+                (f for f in self.frames if f.addr >= k.start)):
+            g = frames_[-1] if frames_ else None
+
+            # new frame?
+            if g is None or f.addr > g.addr+g.size:
+                frames_.append(f)
+            # merge with previous frame?
+            elif f.frame == g.frame:
+                frames_[-1] = FuncFrame(
+                        g.addr,
+                        max(g.size, (f.addr+f.size) - g.addr),
+                        g.frame)
+            # previous frame wins?
+            elif f.frame < g.frame:
+                # slice new frame?
+                if (f.addr+f.size > g.addr+g.size):
+                    frames_.append(FuncFrame(
+                            g.addr+g.size,
+                            f.addr+f.size - (g.addr+g.size),
+                            f.frame))
+            # new frame wins?
+            elif f.frame > g.frame:
+                # slice previous frame
+                frames_[-1] = FuncFrame(
+                        g.addr,
+                        f.addr - g.addr,
+                        g.frame)
+                # append new frame
+                frames_.append(f)
+                # slice previous frame tail?
+                if (f.addr+f.size < g.addr+g.size):
+                    frames_.append(FuncFrame(
+                            f.addr+f.size,
+                            (g.addr+g.size) - (f.addr+f.size),
+                            g.frame))
+
+        self.frames = frames_
+
+    def __getitem__(self, k):
+        """Like get, but raise KeyError if no frame covers k."""
+        v = self.get(k)
+        if v is None:
+            raise KeyError(k)
+        return v
+
+    def __contains__(self, k):
+        return self.get(k) is not None
+
+    def __setitem__(self, k, v):
+        return self.set(k, v)
+
+    def __len__(self):
+        return len(self.frames)
+
+    def __iter__(self):
+        return iter(self.frames)
+
 def collect(obj_paths, *,
        sources=None,
        everything=False,
        **args):
    funcs = []
    globals = co.OrderedDict()
+    incomplete = False
    for obj_path in obj_paths:
        # find relevant symbols
        syms = collect_syms(obj_path,
@ -781,6 +1007,9 @@ def collect(obj_paths, *,
        # find frame info
        frames = collect_dwarf_frames(obj_path, **args)

+        # find location info
+        locs = collect_dwarf_locs(obj_path, **args)
+
        # find the max stack frame for each function
        locals = co.OrderedDict()
        for sym in syms:
@ -810,14 +1039,64 @@ def collect(obj_paths, *,
                        os.path.abspath(file)]) == os.getcwd():
                    continue

-            # find the stack frames for each function
-            frames_ = frames[sym.addr:sym.addr+sym.size]
+            # build our func
+            func = Func(file, sym, entry)

-            func = {'file': file,
-                    'sym': sym,
-                    'entry': entry,
-                    'frames': frames_,
-                    'calls': []}
+            # find the relevant stack frames
+            if entry is not None:
+                # base frame
+                func.frames[sym.addr:sym.addr+sym.size] = max(
+                        (frame.frame
+                            for frame in frames[sym.addr:sym.addr+sym.size]),
+                        default=0)
+
+                for var in entry.info():
+                    # find stack usage of relevant variables
+                    if var.tag in {
+                            'DW_TAG_variable',
+                            'DW_TAG_formal_parameter',
+                            'DW_TAG_call_site_parameter'}:
+                        # ignore vars with no location, these are usually
+                        # globals or synthetic variables
+                        if 'DW_AT_location' not in var:
+                            continue
+
+                        m = re.match(
+                                '^\s*(?P<list>[0xX0-9a-fA-F]+)'
+                                        '\s*\(.*\)\s*$'
+                                    '|' '^.*?\((?P<ops>.*)\)\s*$',
+                                var['DW_AT_location'])
+                        if m.group('ops'):
+                            # TODO use range of lexical_block
+                            locs_ = [Loc(sym.addr, sym.size,
+                                    [m.group('ops').strip()])]
+                        elif m.group('list'):
+                            locs_ = locs[int(m.group('list'), 0)]
+                        else:
+                            assert False, "Unknown loc? %r" % (
+                                    var['DW_AT_location'])
+
+                        for loc in locs_:
+                            frame = frames[loc.addr]
+                            for op in loc.ops:
+                                if op.startswith('DW_OP_fbreg'):
+                                    off = int(op.split(':')[-1].strip(), 0)
+                                    func.frames[loc.addr:loc.addr+loc.size] = (
+                                            frame.frame - off)
+                    # ignore these
+                    elif var.tag in {
+                            'DW_TAG_lexical_block',
+                            'DW_TAG_inlined_subroutine',
+                            'DW_TAG_call_site',
+                            'DW_TAG_label',
+                            'DW_TAG_structure_type',
+                            'DW_TAG_union_type',
+                            'DW_TAG_member'}:
+                        pass
+                    else:
+                        assert False, "Unknown frame tag? %r" % var.tag
+
+            # keep track of funcs
            funcs.append(func)

            # keep track of locals/globals
@ -827,11 +1106,22 @@ def collect(obj_paths, *,
                locals[entry.off] = func

        # link local function calls via dwarf entries
-        for caller in locals.values():
-            if not caller['entry']:
+        for func in locals.values():
+            if not func.entry:
                continue

-            for call in caller['entry'].info(
+            if ((args.get('warn_on_incomplete')
+                        or args.get('error_on_incomplete'))
+                    and 'DW_AT_call_all_calls' not in func.entry):
+                print('%s: incomplete call info in %s '
+                        '(DW_AT_call_all_calls missing)' % (
+                            'error' if args.get('error_on_incomplete')
+                                else 'warning',
+                            func.sym.name),
+                        file=sys.stderr)
+                incomplete = True
+
+            for call in func.entry.info(
                    tags={'DW_TAG_call_site'}):
                if ('DW_AT_call_return_pc' not in call
                        or 'DW_AT_call_origin' not in call):
@ -849,24 +1139,24 @@ def collect(obj_paths, *,

                # callee in locals?
                if off in locals:
-                    callee = locals[off]
+                    func.calls[addr] = locals[off]
                else:
                    # if not, just keep track of the symbol and try to link
                    # during the global pass
-                    callee = info[off].name
+                    func.calls[addr] = info[off].name

-                caller['calls'].append((addr, callee))
+    # error on incomplete calls after printing all relevant functions
+    if args.get('error_on_incomplete') and incomplete:
+        sys.exit(3)

    # link global function calls via symbol
-    for caller in funcs:
-        calls_ = []
-        for addr, callee in caller['calls']:
+    for func in funcs:
+        for addr, callee in func.calls.copy().items():
            if isinstance(callee, str):
                if callee in globals:
-                    calls_.append((addr, globals[callee]))
-            else:
-                calls_.append((addr, callee))
-        caller['calls'] = calls_
+                    func.calls[addr] = globals[callee]
+                else:
+                    del func.calls[addr]

    # recursive+cached limit finder
    def limitof(func, seen=set()):
@ -880,16 +1170,20 @@ def collect(obj_paths, *,
            return limitof.cache[id(func)]

        # find max stack frame
-        frame = max((frame.frame for frame in func['frames']), default=0)
+        frame = max((frame.frame for frame in func.frames), default=0)

        # find stack limit recursively
        limit = frame
-        for addr, callee in func['calls']:
+        for addr, callee in func.calls.items():
            if args.get('no_shrinkwrap'):
                frame_ = frame
            else:
                # use stack frame at call site
-                frame_ = func['frames'][addr].frame
+                frame_ = func.frames.get(addr)
+                if frame_ is not None:
+                    frame_ = frame_.frame
+                else:
+                    frame_ = 0

            _, limit_ = limitof(callee, seen | {id(func)})

@ -912,9 +1206,9 @@ def collect(obj_paths, *,
        # find children recursively
        children = []
        dirty = False
-        for addr, callee in func['calls']:
-            file_ = callee['file']
-            name_ = callee['sym'].name
+        for addr, callee in func.calls.items():
+            file_ = callee.file
+            name_ = callee.sym.name
            frame_, limit_ = limitof(callee, seen | {id(func)})
            children_, notes_, dirty_ = childrenof(callee, seen | {id(func)})
            dirty = dirty or dirty_
@ -929,8 +1223,8 @@ def collect(obj_paths, *,
    # build results
    results = []
    for func in funcs:
-        file = func['file']
-        name = func['sym'].name
+        file = func.file
+        name = func.sym.name
        frame, limit = limitof(func)
        children, notes, _ = childrenof(func)

@ -1545,6 +1839,14 @@ if __name__ == "__main__":
            '-e', '--error-on-recursion',
            action='store_true',
            help="Error if any functions are recursive.")
+    parser.add_argument(
+            '--warn-on-incomplete',
+            action='store_true',
+            help="Warn if callgraph may be incomplete.")
+    parser.add_argument(
+            '--error-on-incomplete',
+            action='store_true',
+            help="Error if callgraph may be incomplete.")
    parser.add_argument(
            '--objdump-path',
            type=lambda x: x.split(),