Refactored project structure; Caching util; process_clang_output.py provides Python API now

2022-07-20 13:35:06 +02:00 · 2022-07-20 13:35:06 +02:00 · dc689174c0
commit dc689174c0
parent 71fe4626f7
10 changed files with 906 additions and 722 deletions
--- a/clang_interop/init.py
+++ b/clang_interop/init.py
--- a/clang_interop/process_clang_output.py
+++ b/clang_interop/process_clang_output.py
@ -0,0 +1,296 @@
+import functools
+import json
+import os
+import pickle
+import re
+from typing import Tuple, Iterable
+
+import numpy as np
+import pandas as pd
+import termcolor
+
+from clang_interop.types import ClNode, ClField, ClTimer, ClMethod, ClPublisher, ClSubscription, ClMemberRef, ClContext, \
+    ClTranslationUnit
+
+# Directory that process_clang_output() lists by default; every file in it is loaded as JSON.
+# NOTE(review): absolute, machine-specific path — presumably meant to be adapted per setup.
+IN_DIR = "/home/max/Projects/llvm-project/clang-tools-extra/ros2-internal-dependency-checker/output"
+# Root directory that SRC_FILE_NAME() joins the reconstructed source path onto.
+SRC_DIR = "/home/max/Projects/autoware/src"
+
+# File the __main__ entry point pickles the resulting ClContext into.
+OUT_NAME = "clang_objects.pkl"
+
+
+def SRC_FILE_NAME(in_file_name: str):
+    return os.path.join(SRC_DIR, in_file_name.replace("-", "/").replace(".json", ".cpp"))
+
+
+# Identifiers the user asked to ignore: filled by prompt_user(), consulted and reset by main().
+ignored_idfs = set()
+
+
+class SetEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, set):
+            return list(o)
+        match o:
+            case set():
+                return list(o)
+            case list() | dict() | int() | float() | str():
+                return json.JSONEncoder.default(self, o)
+            case np.int64:
+                return json.JSONEncoder.default(self, int(o))
+
+        return json.JSONEncoder.default(self, o)
+
+
+def fuse_fields(f1, f2):
+    if f1 is None:
+        return f2
+
+    if f2 is None:
+        return f1
+
+    if f1 == f2:
+        return f1
+
+    raise ValueError(f"Inconsistent fields {f1=} and {f2=} cannot be fused")
+
+
+def fuse_objects(o1, o2):
+    field_names = o1.__dataclass_fields__.keys()
+    for f in field_names:
+        setattr(o1, f, fuse_fields(getattr(o1, f), getattr(o2, f)))
+    return o1
+
+
+def find_data_deps(accesses: Iterable[ClMemberRef]):
+    writes = set()
+    reads = set()
+    publications = {}
+
+    for member_ref in accesses:
+        member_id = member_ref.member_chain[0] if member_ref.member_chain else None
+        if member_id is None:
+            print(f"[WARN ] MemberRef without any members in chain @ {member_ref.source_range}")
+            continue
+
+        dep_tuple = (member_ref.method_id, member_id)
+
+        match member_ref.type:
+            case "write":
+                writes.add(dep_tuple)
+            case "read":
+                reads.add(dep_tuple)
+            case "call" | "arg":
+                writes.add(dep_tuple)
+                reads.add(dep_tuple)
+            case "pub":
+                if member_ref.method_id not in publications:
+                    publications[member_ref.method_id] = set()
+                publications[member_ref.method_id].add(member_id)
+
+    reads = pd.DataFrame.from_records(list(reads), columns=['method_id', 'member_id'])
+    writes = pd.DataFrame.from_records(list(writes), columns=['method_id', 'member_id'])
+    pub_dict = {method: set() for method, _ in publications}
+    for method, member in publications:
+        pub_dict[method].add(member)
+
+    deps = {}
+
+    for reading_method in reads["method_id"].unique().tolist():
+        deps[reading_method] = set()
+
+        read_members = reads[reads['method_id'] == reading_method]["member_id"].unique().tolist()
+
+        for read_member in read_members:
+            writing_methods = writes[writes['member_id'] == read_member]['method_id'].unique().tolist()
+            deps[reading_method].update(writing_methods)
+
+        deps[reading_method].discard(reading_method)  # Remove reflexive dependencies
+
+    return deps, publications
+
+
+def dedup(elems):
+    hash_map = {}
+
+    for e in elems:
+        if e.__hash__() not in hash_map:
+            hash_map[e.__hash__()] = []
+        hash_map[e.__hash__()].append(e)
+
+    ret_list = []
+    for hash, elems in hash_map.items():
+        if len(elems) == 1:
+            ret_list += elems
+            continue
+
+        elem = functools.reduce(fuse_objects, elems[1:], elems[0])
+        ret_list.append(elem)
+        print(f"Fused {len(elems)} {type(elem)}s")
+
+    return ret_list
+
+
+def dictify(elems, key='id'):
+    return {getattr(e, key): e for e in elems}
+
+
+def definitions_from_json(cb_dict):
+    nodes = []
+    pubs = []
+    subs = []
+    timers = []
+    accesses = []
+    fields = []
+    methods = []
+
+    if "nodes" in cb_dict:
+        for node in cb_dict["nodes"]:
+            nodes.append(ClNode(node))
+            for field in node["fields"]:
+                fields.append(ClField(field))
+            for method in node["methods"]:
+                methods.append(ClMethod(method))
+
+    if "publishers" in cb_dict:
+        for publisher in cb_dict["publishers"]:
+            pubs.append(ClPublisher(publisher))
+
+    if "subscriptions" in cb_dict:
+        for subscription in cb_dict["subscriptions"]:
+            subs.append(ClSubscription(subscription))
+
+    if "timers" in cb_dict:
+        for timer in cb_dict["timers"]:
+            timers.append(ClTimer(timer))
+
+    if "accesses" in cb_dict:
+        for access_type in cb_dict["accesses"]:
+            for access in cb_dict["accesses"][access_type]:
+                accesses.append(ClMemberRef(access))
+
+    nodes = dictify(dedup(nodes))
+    pubs = dictify(dedup(pubs), key='member_id')
+    subs = dictify(dedup(subs), key='callback_id')
+    timers = dictify(dedup(timers), key='callback_id')
+    fields = dictify(dedup(fields))
+    methods = dictify(dedup(methods))
+
+    return nodes, pubs, subs, timers, fields, methods, accesses
+
+
+def highlight(substr: str, text: str):
+    regex = r"(?<=\W)({substr})(?=\W)|^({substr})$"
+    return re.sub(regex.format(substr=substr), termcolor.colored(r"\1\2", 'magenta', attrs=['bold']), text)
+
+
+def prompt_user(file: str, cb: str, idf: str, text: str) -> Tuple[str, bool, bool]:
+    print('\n' * 5)
+    print(f"{file.rstrip('.cpp').rstrip('.hpp')}\n->{cb}:")
+    print(highlight(idf.split('::')[-1], text))
+    answer = input(f"{highlight(idf, idf)}\n"
+                   f"write (w), read (r), both (rw), ignore future (i) exit and save (q), undo (z), skip (Enter): ")
+    if answer not in ["", "r", "w", "rw", "q", "z", "i"]:
+        print(f"Invalid answer '{answer}', try again.")
+        answer = prompt_user(file, cb, idf, text)
+
+    if answer == 'i':
+        ignored_idfs.add(idf)
+    elif any(x in answer for x in ['r', 'w']):
+        ignored_idfs.discard(idf)
+
+    return answer, answer == "q", answer == "z"
+
+
+def main(cbs):
+    open_files = {}
+    cb_rw_dict = {}
+
+    jobs = []
+
+    for cb_id, cb_dict in cbs.items():
+        cb_rw_dict[cb_dict['qualified_name']] = {'reads': set(), 'writes': set()}
+        for ref_dict in cb_dict['member_refs']:
+            if ref_dict['file'] not in open_files:
+                with open(ref_dict['file'], 'r') as f:
+                    open_files[ref_dict['file']] = f.readlines()
+
+            ln = ref_dict['start_line'] - 1
+            text = open_files[ref_dict['file']]
+            line = termcolor.colored(text[ln], None, "on_cyan")
+            lines = [*text[ln - 3:ln], line, *text[ln + 1:ln + 4]]
+            text = ''.join(lines)
+            jobs.append((ref_dict['file'], cb_dict['qualified_name'], ref_dict['qualified_name'], text))
+
+    i = 0
+    do_undo = False
+    while i < len(jobs):
+        file, cb, idf, text = jobs[i]
+
+        if do_undo:
+            ignored_idfs.discard(idf)
+            cb_rw_dict[cb]['reads'].discard(idf)
+            cb_rw_dict[cb]['writes'].discard(idf)
+            do_undo = False
+
+        if idf in ignored_idfs:
+            print("Ignoring", idf)
+            i += 1
+            continue
+
+        if idf in cb_rw_dict[cb]['reads'] and idf in cb_rw_dict[cb]['writes']:
+            print(f"{idf} is already written to and read from in {cb}, skipping.")
+            i += 1
+            continue
+
+        classification, answ_quit, answ_undo = prompt_user(file, cb, idf, text)
+
+        if answ_quit:
+            del cb_rw_dict[file][cb]
+            break
+        elif answ_undo:
+            i -= 1
+            do_undo = True
+            continue
+
+        if 'r' in classification:
+            cb_rw_dict[cb]['reads'].add(idf)
+        if 'w' in classification:
+            cb_rw_dict[cb]['writes'].add(idf)
+        if not any(x in classification for x in ['r', 'w']):
+            print(f"Ignoring occurences of {idf} in cb.")
+
+        i += 1
+
+    with open("deps.json", "w") as f:
+        json.dump(cb_rw_dict, f, cls=SetEncoder)
+
+    print("Done.")
+
+
+def process_clang_output(directory=IN_DIR):
+    clang_context = ClContext()
+
+    for filename in os.listdir(IN_DIR):
+        source_filename = SRC_FILE_NAME(filename)
+        print(f"Processing {source_filename}")
+        with open(os.path.join(IN_DIR, filename), "r") as f:
+            cb_dict = json.load(f)
+            if cb_dict is None:
+                print(f"  [WARN ] Empty tool output detected in {filename}")
+                continue
+
+            nodes, pubs, subs, timers, fields, methods, accesses = definitions_from_json(cb_dict)
+            deps, publications = find_data_deps(accesses)
+
+            tu = ClTranslationUnit(deps, publications, nodes, pubs, subs, timers, fields, methods, accesses)
+            clang_context.translation_units[source_filename] = tu
+
+    return clang_context
+
+
+# Script entry point: process the default input directory and pickle the resulting context.
+if __name__ == "__main__":
+    clang_context = process_clang_output()
+
+    # Persist the ClContext to OUT_NAME, resolved against the current working directory.
+    with open(OUT_NAME, "wb") as f:
+        pickle.dump(clang_context, f)
+
+    print("Done.")
--- a/clang_interop/types.py
+++ b/clang_interop/types.py
@ -0,0 +1,173 @@
+import os
+from dataclasses import dataclass, field
+from typing import List, Literal, Dict, Set
+
+
+@dataclass
+class ClTranslationUnit:
+    # Everything extracted from one tool-output file; instantiated in process_clang_output().
+    # NOTE(review): ids are annotated as int — the real id type comes from the tool's JSON; confirm.
+    dependencies: Dict[int, Set[int]]  # reading method id -> ids of other methods writing a member it reads
+    publications: Dict[int, Set[int]]  # method id -> member ids referenced by its "pub" accesses
+    nodes: Dict[int, 'ClNode']  # keyed by ClNode.id
+    publishers: Dict[int, 'ClPublisher']  # keyed by ClPublisher.member_id
+    subscriptions: Dict[int, 'ClSubscription']  # keyed by ClSubscription.callback_id
+    timers: Dict[int, 'ClTimer']  # keyed by ClTimer.callback_id
+    fields: Dict[int, 'ClField']  # keyed by ClField.id
+    methods: Dict[int, 'ClMethod']  # keyed by ClMethod.id
+    accesses: List['ClMemberRef']  # all member references, not de-duplicated
+
+
+@dataclass
+class ClContext:
+    # Container for all processed translation units; starts out empty.
+    # process_clang_output() keys the entries by the source path returned by SRC_FILE_NAME().
+    translation_units: Dict[str, 'ClTranslationUnit'] = field(default_factory=dict)
+
+
+@dataclass
+class ClSourceRange:
+    start_file: str
+    start_line: int | None
+    start_col: int | None
+
+    end_file: str
+    end_line: int | None
+    end_col: int | None
+
+    def __init__(self, json_obj):
+        begin = json_obj["begin"].split(":")
+        end = json_obj["end"].split(":")
+
+        self.start_file = os.path.realpath(begin[0])
+        self.start_line = int(begin[1]) if len(begin) > 1 else None
+        self.start_col = int(begin[2].split(" ")[0]) if len(begin) > 2 else None
+
+        self.end_file = os.path.realpath(end[0])
+        self.end_line = int(end[1]) if len(end) > 1 else None
+        self.end_col = int(end[2].split(" ")[0]) if len(end) > 2 else None
+
+    def __hash__(self):
+        return hash((self.start_file, self.start_line, self.start_col,
+                     self.end_file, self.end_line, self.end_col))
+
+
+@dataclass
+class ClNode:
+    id: int
+    qualified_name: str
+    source_range: 'ClSourceRange'
+    field_ids: List[int] | None
+    method_ids: List[int] | None
+    ros_name: str | None
+    ros_namespace: str | None
+
+    def __init__(self, json_obj):
+        self.id = json_obj['id']
+        self.qualified_name = json_obj['id']
+        self.source_range = ClSourceRange(json_obj['source_range'])
+        self.field_ids = list(map(lambda obj: obj['id'], json_obj['fields'])) if 'fields' in json_obj else None
+        self.method_ids = list(map(lambda obj: obj['id'], json_obj['methods'])) if 'methods' in json_obj else None
+        self.ros_name = json_obj['ros_name'] if 'ros_name' in json_obj else None
+        self.ros_namespace = json_obj['ros_namespace'] if 'ros_namespace' in json_obj else None
+
+    def __hash__(self):
+        return hash(self.id)
+
+
+@dataclass
+class ClMethod:
+    id: int
+    qualified_name: str
+    source_range: 'ClSourceRange'
+    return_type: str | None
+    parameter_types: List[str] | None
+
+    def __init__(self, json_obj):
+        self.id = json_obj['id']
+        self.qualified_name = json_obj['qualified_name']
+        self.source_range = ClSourceRange(json_obj['source_range'])
+        self.return_type = json_obj['signature']['return_type'] if 'signature' in json_obj else None
+        self.parameter_types = json_obj['signature']['parameter_types'] if 'signature' in json_obj else None
+
+    def __hash__(self):
+        return hash(self.id)
+
+
+@dataclass
+class ClField:
+    id: int
+    qualified_name: str
+    source_range: 'ClSourceRange'
+
+    def __init__(self, json_obj):
+        self.id = json_obj['id']
+        self.qualified_name = json_obj['qualified_name']
+        self.source_range = ClSourceRange(json_obj['source_range'])
+
+    def __hash__(self):
+        return hash(self.id)
+
+
+@dataclass
+class ClMemberRef:
+    type: Literal["read", "write", "call", "arg", "pub"] | None
+    member_chain: List[int]
+    method_id: int | None
+    node_id: int | None
+    source_range: 'ClSourceRange'
+
+    def __init__(self, json_obj):
+        access_type = json_obj['context']['access_type']
+        if access_type == 'none':
+            access_type = None
+        self.type = access_type
+        self.member_chain = list(map(lambda obj: obj['id'], json_obj['member'][::-1]))
+        self.method_id = json_obj['context']['method']['id'] if 'method' in json_obj['context'] else None
+        self.node_id = json_obj['context']['node']['id'] if 'node' in json_obj['context'] else None
+        self.source_range = ClSourceRange(json_obj['context']['statement']['source_range'])
+
+    def __hash__(self):
+        return self.source_range.__hash__()
+
+
+@dataclass
+class ClSubscription:
+    topic: str | None
+    callback_id: int | None
+    source_range: 'ClSourceRange'
+
+    def __init__(self, json_obj):
+        self.topic = json_obj['topic'] if 'topic' in json_obj else None
+        self.callback_id = json_obj['callback']['id'] if 'callback' in json_obj else None
+        self.source_range = ClSourceRange(json_obj['source_range'])
+
+    def __hash__(self):
+        return self.source_range.__hash__()
+
+
+@dataclass
+class ClPublisher:
+    topic: str | None
+    member_id: int | None
+    source_range: 'ClSourceRange'
+
+    def update(self, t2: 'ClTimer'):
+        return self
+
+    def __init__(self, json_obj):
+        self.topic = json_obj['topic'] if 'topic' in json_obj else None
+        self.member_id = json_obj['member']['id'] if 'member' in json_obj else None
+        self.source_range = ClSourceRange(json_obj['source_range'])
+
+    def __hash__(self):
+        return self.source_range.__hash__()
+
+
+@dataclass
+class ClTimer:
+    callback_id: int | None
+    source_range: 'ClSourceRange'
+
+    def __init__(self, json_obj):
+        self.callback_id = json_obj['callback']['id'] if 'callback' in json_obj else None
+        self.source_range = ClSourceRange(json_obj['source_range'])
+
+    def __hash__(self):
+        return self.source_range.__hash__()