Refactored project structure; Caching util; process_clang_output.py provides Python API now

This commit is contained in:
Maximilian Schmeller 2022-07-20 13:35:06 +02:00
parent 71fe4626f7
commit dc689174c0
10 changed files with 906 additions and 722 deletions

View file

@ -1,439 +0,0 @@
import functools
import json
import os
import pickle
import re
from dataclasses import dataclass
from typing import Tuple, List, Literal, Iterable
import numpy as np
import pandas as pd
import termcolor
# Directory containing the per-source-file JSON output of the clang tool.
IN_DIR = "/home/max/Projects/llvm-project/clang-tools-extra/ros2-internal-dependency-checker/output"
# Root of the source tree the JSON files were generated from.
SRC_DIR = "/home/max/Projects/autoware/src"
# Pickle file that the aggregated results are written to.
OUT_NAME = "clang_objects.pkl"


def SRC_FILE_NAME(in_file_name: str):
    """Map a tool-output JSON filename back to the source file it describes.

    Dashes in the JSON filename encode path separators; the .json suffix is
    swapped for .cpp.
    """
    relative_path = in_file_name.replace("-", "/").replace(".json", ".cpp")
    return os.path.join(SRC_DIR, relative_path)


# Identifiers the user chose to ignore across all interactive prompts.
ignored_idfs = set()
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as lists and NumPy integers as ints.

    BUG FIX: the previous `match` fallthrough compared the object against the
    *type* `np.int64` (a value pattern), which never matches an instance, so
    NumPy integers raised TypeError instead of being encoded.
    """

    def default(self, o):
        if isinstance(o, set):
            return list(o)
        if isinstance(o, np.integer):
            # np.int64 & friends are not JSON-serializable by default.
            return int(o)
        # Delegate to the base class, which raises TypeError for unknown types.
        return json.JSONEncoder.default(self, o)
def fuse_fields(f1, f2):
    """Merge two field values, preferring whichever one is not None.

    Raises:
        ValueError: if both values are set but disagree.
    """
    if f2 is None:
        return f1
    if f1 is None:
        return f2
    if f1 == f2:
        return f1
    raise ValueError(f"Inconsistent fields {f1=} and {f2=} cannot be fused")
def fuse_objects(o1, o2):
    """Merge ``o2`` into ``o1`` field-by-field via fuse_fields.

    Mutates and returns ``o1``. Both objects must be dataclass instances of
    the same shape.
    """
    for name in o1.__dataclass_fields__.keys():
        merged = fuse_fields(getattr(o1, name), getattr(o2, name))
        setattr(o1, name, merged)
    return o1
@dataclass
class SourceRange:
start_file: str
start_line: int | None
start_col: int | None
end_file: str
end_line: int | None
end_col: int | None
def __init__(self, json_obj):
begin = json_obj["begin"].split(":")
end = json_obj["end"].split(":")
self.start_file = os.path.realpath(begin[0])
self.start_line = int(begin[1]) if len(begin) > 1 else None
self.start_col = int(begin[2].split(" ")[0]) if len(begin) > 2 else None
self.end_file = os.path.realpath(end[0])
self.end_line = int(end[1]) if len(end) > 1 else None
self.end_col = int(end[2].split(" ")[0]) if len(end) > 2 else None
def __hash__(self):
return hash((self.start_file, self.start_line, self.start_col,
self.end_file, self.end_line, self.end_col))
@dataclass
class Node:
    """A ROS node definition with references to its fields and methods."""
    id: int
    qualified_name: str
    source_range: 'SourceRange'
    field_ids: List[int] | None
    method_ids: List[int] | None

    def __init__(self, json_obj):
        self.id = json_obj['id']
        # BUG FIX: previously read json_obj['id'] here, so qualified_name
        # duplicated the numeric id (compare Method/Field which use
        # 'qualified_name').
        self.qualified_name = json_obj['qualified_name']
        self.source_range = SourceRange(json_obj['source_range'])
        # 'fields'/'methods' are optional in the tool output.
        self.field_ids = [f['id'] for f in json_obj['fields']] if 'fields' in json_obj else None
        self.method_ids = [m['id'] for m in json_obj['methods']] if 'methods' in json_obj else None

    def __hash__(self):
        # Identity is the tool-assigned id.
        return hash(self.id)
@dataclass
class Method:
    """A method definition; signature details are optional in the input."""
    id: int
    qualified_name: str
    source_range: 'SourceRange'
    return_type: str | None
    parameter_types: List[str] | None

    def __init__(self, json_obj):
        self.id = json_obj['id']
        self.qualified_name = json_obj['qualified_name']
        self.source_range = SourceRange(json_obj['source_range'])
        has_signature = 'signature' in json_obj
        self.return_type = json_obj['signature']['return_type'] if has_signature else None
        self.parameter_types = json_obj['signature']['parameter_types'] if has_signature else None

    def __hash__(self):
        # Identity is the tool-assigned id.
        return hash(self.id)
@dataclass
class Field:
    """A member-field definition (id, qualified name, and location)."""
    id: int
    qualified_name: str
    source_range: 'SourceRange'

    def __init__(self, json_obj):
        self.id, self.qualified_name = json_obj['id'], json_obj['qualified_name']
        self.source_range = SourceRange(json_obj['source_range'])

    def __hash__(self):
        # Identity is the tool-assigned id.
        return hash(self.id)
@dataclass
class MemberRef:
    """A single access to a member within a method/node context."""
    type: Literal["read", "write", "call", "arg", "pub"] | None
    member_chain: List[int]
    method_id: int | None
    node_id: int | None
    source_range: 'SourceRange'

    def __init__(self, json_obj):
        context = json_obj['context']
        raw_type = context['access_type']
        # The tool encodes "unknown" as the literal string 'none'.
        self.type = None if raw_type == 'none' else raw_type
        # Member ids, reversed relative to the JSON order.
        self.member_chain = [m['id'] for m in reversed(json_obj['member'])]
        self.method_id = context['method']['id'] if 'method' in context else None
        self.node_id = context['node']['id'] if 'node' in context else None
        self.source_range = SourceRange(context['statement']['source_range'])

    def __hash__(self):
        # No stable id in the input; the statement location identifies the ref.
        return hash(self.source_range)
@dataclass
class Subscription:
    """A topic subscription; topic and callback reference are optional."""
    topic: str | None
    callback_id: int | None
    source_range: 'SourceRange'

    def __init__(self, json_obj):
        self.topic = json_obj.get('topic')
        self.callback_id = json_obj['callback']['id'] if 'callback' in json_obj else None
        self.source_range = SourceRange(json_obj['source_range'])

    def __hash__(self):
        # No stable id in the input; the location identifies the subscription.
        return hash(self.source_range)
@dataclass
class Publisher:
    """A topic publisher; topic and owning member reference are optional."""
    topic: str | None
    member_id: int | None
    source_range: 'SourceRange'

    def __init__(self, json_obj):
        self.topic = json_obj.get('topic')
        self.member_id = json_obj['member']['id'] if 'member' in json_obj else None
        self.source_range = SourceRange(json_obj['source_range'])

    def update(self, t2: 'Timer'):
        # No-op merge hook; returns self unchanged.
        return self

    def __hash__(self):
        # No stable id in the input; the location identifies the publisher.
        return hash(self.source_range)
@dataclass
class Timer:
    """A timer definition; the callback reference is optional."""
    callback_id: int | None
    source_range: 'SourceRange'

    def __init__(self, json_obj):
        self.callback_id = json_obj['callback']['id'] if 'callback' in json_obj else None
        self.source_range = SourceRange(json_obj['source_range'])

    def __hash__(self):
        # No stable id in the input; the location identifies the timer.
        return hash(self.source_range)
def find_data_deps(nodes: Iterable[Node], pubs: Iterable[Publisher], subs: Iterable[Subscription],
timers: Iterable[Timer], fields, methods, accesses: Iterable[MemberRef]):
writes = set()
reads = set()
publications = set()
for member_ref in accesses:
member_id = member_ref.member_chain[0] if member_ref.member_chain else None
if member_id is None:
print(f"[WARN ] MemberRef without any members in chain @ {member_ref.source_range}")
continue
dep_tuple = (member_ref.method_id, member_id)
match member_ref.type:
case "write":
writes.add(dep_tuple)
case "read":
reads.add(dep_tuple)
case "call" | "arg":
writes.add(dep_tuple)
reads.add(dep_tuple)
case "pub":
publications.add(dep_tuple)
reads = pd.DataFrame.from_records(list(reads), columns=['method_id', 'member_id'])
writes = pd.DataFrame.from_records(list(writes), columns=['method_id', 'member_id'])
pub_dict = {method: set() for method, _ in publications}
for method, member in publications:
pub_dict[method].add(member)
deps = {}
for reading_method in reads["method_id"].unique().tolist():
deps[reading_method] = set()
read_members = reads[reads['method_id'] == reading_method]["member_id"].unique().tolist()
for read_member in read_members:
writing_methods = writes[writes['member_id'] == read_member]['method_id'].unique().tolist()
deps[reading_method].update(writing_methods)
deps[reading_method].discard(reading_method) # Remove reflexive dependencies
return publications, deps
def dedup(elems):
    """Collapse elements that share a hash into single fused objects.

    Elements are grouped by ``hash``; singleton groups pass through untouched
    and larger groups are folded pairwise with ``fuse_objects``. Order of
    first occurrence is preserved.

    Fixes: the previous version shadowed the builtin ``hash`` and its own
    ``elems`` parameter inside the loop, and called ``__hash__`` directly
    instead of ``hash()``.
    """
    groups = {}
    for elem in elems:
        groups.setdefault(hash(elem), []).append(elem)
    result = []
    for group in groups.values():
        if len(group) == 1:
            result.extend(group)
            continue
        fused = functools.reduce(fuse_objects, group[1:], group[0])
        result.append(fused)
        print(f"Fused {len(group)} {type(fused)}s")
    return result
def definitions_from_json(cb_dict):
    """Instantiate typed definition objects from one raw tool-output dict.

    Every section of the input is optional. All collections except
    ``accesses`` are deduplicated before being returned.

    Returns:
        (nodes, pubs, subs, timers, fields, methods, accesses)
    """
    nodes, fields, methods = [], [], []
    for node_obj in cb_dict.get("nodes", []):
        nodes.append(Node(node_obj))
        fields += [Field(f) for f in node_obj["fields"]]
        methods += [Method(m) for m in node_obj["methods"]]

    pubs = [Publisher(p) for p in cb_dict.get("publishers", [])]
    subs = [Subscription(s) for s in cb_dict.get("subscriptions", [])]
    timers = [Timer(t) for t in cb_dict.get("timers", [])]

    # "accesses" maps access-type -> list of raw refs; flatten all of them.
    accesses = [MemberRef(a)
                for access_list in cb_dict.get("accesses", {}).values()
                for a in access_list]

    return (dedup(nodes), dedup(pubs), dedup(subs), dedup(timers),
            dedup(fields), dedup(methods), accesses)
def highlight(substr: str, text: str):
    """Return *text* with whole-word occurrences of *substr* in bold magenta.

    BUG FIX: the previous pattern required a \\W character on both sides
    (or the substring being the entire string), so matches at the very start
    or end of *text* were missed; *substr* was also interpolated unescaped,
    breaking on regex metacharacters. The negative lookaround form handles
    both.
    """
    pattern = r"(?<!\w)({})(?!\w)".format(re.escape(substr))
    return re.sub(pattern, termcolor.colored(r"\1", 'magenta', attrs=['bold']), text)
def prompt_user(file: str, cb: str, idf: str, text: str) -> Tuple[str, bool, bool]:
    """Show a member-reference context and ask the user to classify it.

    Updates the module-level ``ignored_idfs`` set as a side effect.

    BUG FIX: on invalid input the previous version assigned the recursive
    call's *tuple* result to ``answer`` and then kept processing it as a
    string, corrupting both the ignore-set bookkeeping and the returned
    value. Re-prompting is now done with a loop. Also replaced
    ``rstrip('.cpp')`` (which strips a character *set* and can eat trailing
    name characters) with ``removesuffix``.

    Returns:
        (answer, quit_requested, undo_requested)
    """
    while True:
        print('\n' * 5)
        print(f"{file.removesuffix('.cpp').removesuffix('.hpp')}\n->{cb}:")
        print(highlight(idf.split('::')[-1], text))
        answer = input(f"{highlight(idf, idf)}\n"
                       f"write (w), read (r), both (rw), ignore future (i) exit and save (q), undo (z), skip (Enter): ")
        if answer in ("", "r", "w", "rw", "q", "z", "i"):
            break
        print(f"Invalid answer '{answer}', try again.")
    if answer == 'i':
        ignored_idfs.add(idf)
    elif any(x in answer for x in ['r', 'w']):
        ignored_idfs.discard(idf)
    return answer, answer == "q", answer == "z"
def main(nodes, cbs, fields, methods):
    """Interactively classify member references of callbacks as reads/writes.

    For every member reference of every callback, shows the surrounding
    source lines and asks the user for a classification via ``prompt_user``.
    The per-callback read/write sets are written to ``deps.json``.

    ``nodes``, ``fields`` and ``methods`` are currently unused but kept for
    interface compatibility.
    """
    open_files = {}   # file path -> cached list of source lines
    cb_rw_dict = {}   # callback qualified name -> {'reads': set, 'writes': set}
    jobs = []
    for cb_id, cb_dict in cbs.items():
        cb_rw_dict[cb_dict['qualified_name']] = {'reads': set(), 'writes': set()}
        for ref_dict in cb_dict['member_refs']:
            if ref_dict['file'] not in open_files:
                with open(ref_dict['file'], 'r') as f:
                    open_files[ref_dict['file']] = f.readlines()
            ln = ref_dict['start_line'] - 1
            text = open_files[ref_dict['file']]
            line = termcolor.colored(text[ln], None, "on_cyan")
            # BUG FIX: clamp the context-window start at 0; a negative slice
            # start would silently pull lines from the end of the file.
            lines = [*text[max(ln - 3, 0):ln], line, *text[ln + 1:ln + 4]]
            text = ''.join(lines)
            jobs.append((ref_dict['file'], cb_dict['qualified_name'], ref_dict['qualified_name'], text))

    i = 0
    do_undo = False
    while i < len(jobs):
        file, cb, idf, text = jobs[i]
        if do_undo:
            # Revert whatever the previous answer recorded for this job.
            ignored_idfs.discard(idf)
            cb_rw_dict[cb]['reads'].discard(idf)
            cb_rw_dict[cb]['writes'].discard(idf)
            do_undo = False
        if idf in ignored_idfs:
            print("Ignoring", idf)
            i += 1
            continue
        if idf in cb_rw_dict[cb]['reads'] and idf in cb_rw_dict[cb]['writes']:
            print(f"{idf} is already written to and read from in {cb}, skipping.")
            i += 1
            continue
        classification, answ_quit, answ_undo = prompt_user(file, cb, idf, text)
        if answ_quit:
            # BUG FIX: was ``del cb_rw_dict[file][cb]``, which raised KeyError
            # because the dict is keyed by callback name. Drop the partially
            # classified callback before saving.
            del cb_rw_dict[cb]
            break
        elif answ_undo:
            i -= 1
            do_undo = True
            continue
        if 'r' in classification:
            cb_rw_dict[cb]['reads'].add(idf)
        if 'w' in classification:
            cb_rw_dict[cb]['writes'].add(idf)
        if not any(x in classification for x in ['r', 'w']):
            print(f"Ignoring occurrences of {idf} in cb.")
        i += 1

    with open("deps.json", "w") as f:
        json.dump(cb_rw_dict, f, cls=SetEncoder)
    print("Done.")
if __name__ == "__main__":
    out_dict = {}
    for filename in os.listdir(IN_DIR):
        source_filename = SRC_FILE_NAME(filename)
        print(f"Processing {source_filename}")
        with open(os.path.join(IN_DIR, filename), "r") as f:
            cb_dict = json.load(f)
        if cb_dict is None:
            # BUG FIX: the warning previously printed a literal placeholder
            # instead of naming the offending file.
            print(f"[WARN ] Empty tool output detected in {filename}")
            continue
        definitions = definitions_from_json(cb_dict)
        # BUG FIX: find_data_deps returns (publications, deps); the unpacking
        # was swapped, storing publications under "dependencies" and vice
        # versa in the pickle.
        publications, deps = find_data_deps(*definitions)
        (nodes, pubs, subs, timers, fields, methods, accesses) = definitions
        out_dict[source_filename] = {
            "dependencies": deps,
            "publications": publications,
            "nodes": nodes,
            "publishers": pubs,
            "subscriptions": subs,
            "timers": timers,
            "fields": fields,
            "methods": methods
        }
    with open(OUT_NAME, "wb") as f:
        pickle.dump(out_dict, f)
    print("Done.")