import subprocess
import os
import sys
import re

### helper functions

def list_to_dict(f, l):
    """dict with elements of list as keys & as values transformed by f"""
    d = {}
    for el in l:
        d[el] = f(el)
    return d
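
# For example (illustrative):
#   list_to_dict(len, ["a", "bb"]) == {"a": 1, "bb": 2}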


def dict_remove_empty(d):
    """remove keys that have [] or {} as values"""
    new = {}
    for k, v in d.items():
        if not (v == [] or v == {}):
            new[k] = v
    return new


def identity(x):
    """identity function"""
    return x


def const(x):
    """(curried) constant function"""
    def f(y):
        return x
    return f


def memoized(cache, f, arg):
    """Memoizes a call to `f` with `arg` in the dict `cache`.

    Modifies the cache dict in place."""
    if arg in cache:
        return cache[arg]
    else:
        res = f(arg)
        cache[arg] = res
        return res
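
# Illustrative: repeated calls with the same argument hit the cache
# instead of calling f again:
#   cache = {}
#   memoized(cache, len, "abc")  # calls len("abc"), stores 3
#   memoized(cache, len, "abc")  # returns 3 from cache, len is not called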


### IO functions that find elf dependencies

# matches one line of the dynamic section output:
# a field key (e.g. NEEDED) followed by its value
_field_matcher = re.compile(b" ([A-Z0-9_]+) +(.*)$")


def read_dynamic_fields(elf_path):
    """Read the dynamic header fields from an elf binary.

    Args:
      elf_path: path to the elf binary (either absolute or relative to pwd)

    Returns:
      a list [(field_key, field_value)] where field_keys could appear multiple
      times (for example there's usually more than one NEEDED field).
    """
    res = subprocess.check_output([
        # force locale to C for stable output
        "env", "LC_ALL=C",
        "objdump",
        # specifying the section brings execution time down from 150ms to 10ms
        "--section=.dynamic",
        "--all-headers",
        elf_path
    ])
    to_end = res.split(b"Dynamic Section:\n")[1]
    # to first empty line
    dyn_section = to_end[: 1 + to_end.find(b"\n\n")]

    def read_dynamic_field(s):
        """return (field_key, field_value)"""
        return _field_matcher.match(s).groups()

    return list(map(read_dynamic_field, dyn_section.splitlines(True)))
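
# The returned list has this shape (the values are hypothetical;
# the exact fields depend on the binary):
#   [(b"NEEDED", b"libc.so.6"), (b"RUNPATH", b"$ORIGIN/../lib")]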


def __query_dynamic_fields(df, key):
    """takes a list of dynamic field tuples (key and value),
    where keys can appear multiple times, and returns a list of all
    values with the given key (in stable order)."""
    return [v for k, v in df if k == key]


def parse_runpath_dirs(elf_path, elf_dynamic_fields):
    """Parse the DT_RUNPATH entry from a list of elf dynamic fields.

    Returns:
      a list of
      { path: unmodified string from DT_RUNPATH
      , absolute_path: fully normalized, absolute path to dir }
    """
    fields = __query_dynamic_fields(elf_dynamic_fields, b"RUNPATH")
    if fields == []:
        return []
    assert len(fields) == 1
    val = fields[0]
    origin = os.path.dirname(elf_path)
    return [{ 'path': path,
              'absolute_path': os.path.abspath(path.replace("$ORIGIN", origin)) }
            for path in val.decode().strip(":").split(":")
            if path != ""]
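
# Illustrative: for a (hypothetical) elf at /opt/app/bin/tool whose
# DT_RUNPATH is "$ORIGIN/../lib", this returns
#   [{'path': '$ORIGIN/../lib', 'absolute_path': '/opt/app/lib'}]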


def parse_needed(elf_dynamic_fields):
    """Returns the list of DT_NEEDED entries for elf"""
    return [n.decode() for n in __query_dynamic_fields(elf_dynamic_fields, b"NEEDED")]


### Main utility

# cannot find dependency
LDD_MISSING = "MISSING"
# don't know how to search for dependency
LDD_UNKNOWN = "DUNNO"
# list of all errors for easy branching
LDD_ERRORS = [ LDD_MISSING, LDD_UNKNOWN ]


def _ldd(elf_cache, f, elf_path):
    """Same as `ldd` (below), except for an additional `elf_cache` argument,
    which is a dict needed for memoizing elf files that were already read.
    This is done because the elf reading operation is quite expensive
    and many files are referenced multiple times (e.g. glib.so)."""

    def search(rdirs, elf_libname):
        """search for elf_libname in the runpath dirs
        and return either the found entry or LDD_MISSING"""
        res = LDD_MISSING
        for rdir in rdirs:
            potential_path = os.path.join(rdir['absolute_path'], elf_libname)
            if os.path.exists(potential_path):
                res = {
                    'item': potential_path,
                    'found_in': rdir,
                }
                break
        return res

    def recurse(search_res):
        """Unfold the subtree of ELF dependencies for a `search` result"""
        if search_res == LDD_MISSING:
            return LDD_MISSING
        else:
            # we keep all other fields in search_res the same,
            # just item is the one that does the recursion.
            # This is the part that would normally be done by fmap.
            search_res['item'] = _ldd(elf_cache, f, search_res['item'])
            return search_res

    # (GNU) ld.so resolves any symlinks before searching for dependencies
    elf_realpath = os.path.realpath(elf_path)

    # memoized uses the cache to not repeat the I/O action
    # for the same elf files (same path)
    dyn_fields = memoized(
        elf_cache, read_dynamic_fields, elf_realpath
    )
    rdirs = parse_runpath_dirs(elf_realpath, dyn_fields)
    all_needed = parse_needed(dyn_fields)

    # if there are no runpath dirs we don't know where to search
    if rdirs == []:
        needed = list_to_dict(const(LDD_UNKNOWN), all_needed)
    else:
        needed = list_to_dict(
            lambda name: recurse(search(rdirs, name)),
            all_needed
        )

    result = {
        'runpath_dirs': rdirs,
        'needed': needed
    }
    # Here, f is applied to the result of the previous level of recursion
    return f(result)


def ldd(f, elf_path):
    """Follows DT_NEEDED ELF headers for elf by searching through the DT_RUNPATH dirs.

    DependencyInfo :
      { needed : dict(string, union(
          LDD_MISSING, LDD_UNKNOWN,
          {
            # the needed dependency
            item : a,
            # where the dependency was found
            found_in : RunpathDir
          }))
      # all runpath directories that were searched
      , runpath_dirs : [ RunpathDir ] }

    Args:
      f: DependencyInfo -> a
         modifies the results of each level
      elf_path: path to ELF file, either absolute or relative to current working dir

    Returns: a
    """
    elf_cache = {}
    return _ldd(elf_cache, f, elf_path)
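
# Illustrative usage (the binary path is hypothetical):
#   deps = ldd(identity, "./bin/myprog")
# Passing `identity` returns the full DependencyInfo tree unchanged.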


### Functions to pass to ldd

# Only use the current layer

def remove_matching_needed(d, re_matcher_absolute_path=None, re_matcher_path=None):
    """Destructively removes needed values from d['needed']
    if they match the given regex matcher.
    Doesn't remove LDD_ERRORS."""
    def pred(v):
        """return true if match"""
        if v in LDD_ERRORS:
            return False
        found_in = v['found_in']
        abs_match = re_matcher_absolute_path.match(found_in['absolute_path']) \
            if re_matcher_absolute_path else False
        match = re_matcher_path.match(found_in['path']) \
            if re_matcher_path else False
        return bool(abs_match or match)

    d['needed'] = {
        k: v for k, v in d['needed'].items()
        if not pred(v)
    }
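
# Illustrative: a function to pass to `ldd` that drops dependencies found
# in nix store paths (the regex and the binary path are examples):
#   def drop_store_paths(d):
#       remove_matching_needed(d, re.compile(r"/nix/store"))
#       return d
#   ldd(drop_store_paths, "./bin/myprog")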


def remove_matching_runpaths(d, re_matcher):
    """Destructively removes runpaths from d['runpath_dirs']
    if they match the given regex matcher."""
    d['runpath_dirs'] = [
        runp for runp in d['runpath_dirs']
        if not re_matcher.match(runp['absolute_path'])
    ]
    return d


def non_existing_runpaths(d):
    """Return a list of runpath_dirs that do not exist in the file system."""
    return [
        runp for runp in d['runpath_dirs']
        if not os.path.exists(runp['absolute_path'])
    ]
def unused_runpaths(d):
|
|
"""Return a list of runpath_dirs that were not used to find NEEDED dependencies."""
|
|
used = set()
|
|
for k, v in d['needed'].items():
|
|
if not v in LDD_ERRORS:
|
|
used.add(v['found_in']['absolute_path'])
|
|
return [
|
|
u for u in d['runpath_dirs']
|
|
if u['absolute_path'] not in used
|
|
]
|
|
|
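
# Illustrative: `unused_runpaths` can be passed to `ldd` directly to get
# the unused runpath dirs of the top-level binary (the path is hypothetical):
#   ldd(unused_runpaths, "./bin/myprog")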


# Also use the results of sub-layers

def collect_unused_runpaths(d):
    """Like `unused_runpaths`, but creates a deduplicated list of all unused runpaths
    for its dependencies instead of just returning them for the current layer.

    Returns:
      a dict of two fields:
      `mine` contains the unused runpath entries of the current binary under scrutiny
      `others` contains a flat dict of all .sos with unused runpath entries and a list of them for each .so
    """
    used = set()
    given = set(r['absolute_path'] for r in d['runpath_dirs'])
    prev = {}
    # TODO: use `unused_runpaths` here
    for k, v in d['needed'].items():
        if v not in LDD_ERRORS:
            used.add(v['found_in']['absolute_path'])
            prev[k] = v['item']
    unused = [
        u for u in given.difference(used)
        # leave out nix storepaths
        if not u.startswith("/nix/store")
    ]

    # Each layer doesn't know its own name,
    # so we return a list of unused dirs for this layer ('mine')
    # and a dict of all previous layers combined (name to list)
    def combine_unused(deps):
        res = {}
        for name, dep in deps.items():
            res.update(dep['others'])
            res[name] = dep['mine']
        return res

    return {
        'mine': unused,
        'others': combine_unused(prev),
    }
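
# Illustrative usage (the binary path is hypothetical):
#   report = ldd(collect_unused_runpaths, "./bin/myprog")
#   report['mine']    # unused runpath entries of the binary itself
#   report['others']  # dict from .so name to its unused runpath entries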