#!/usr/bin/env python3
"""
Usage: generate.py

This script takes no parameters, and can be called from any directory in the file system.
"""

import os
import re
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple, Set, Optional

# Compile regex patterns once at module level
INCLUDE_PATTERN = re.compile(r"^\s*#include.*/.*\.h")
INCLUDE_PATH_PATTERN = re.compile(r'[<"]([^>"]+)[>"]')
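
# Illustrative examples: INCLUDE_PATTERN matches directives whose path contains
# a '/' and names a .h header, e.g. '#include <xrpl/basics/base_uint.h>' or
# '#include "xrpld/app/main/Application.h"', but not '#include <vector>'.
# INCLUDE_PATH_PATTERN captures the path between the quotes or angle brackets,
# e.g. 'xrpl/basics/base_uint.h'.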


def dictionary_sort_key(s: str) -> str:
    """
    Create a sort key that mimics 'sort -d' (dictionary order).

    Dictionary order only considers blanks and alphanumeric characters.
    This means punctuation like '.' is ignored during sorting.
    """
    # Keep only alphanumeric characters and spaces
    return "".join(c for c in s if c.isalnum() or c.isspace())
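
# For example, dictionary_sort_key("xrpl.basics") returns "xrplbasics", so the
# '.' separators are ignored when level names are compared during sorting.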


def get_level(file_path: str) -> str:
    """
    Extract the level from a file path (second and third directory components).
    Equivalent to bash: cut -d/ -f 2,3

    Examples:
        src/xrpld/app/main.cpp -> xrpld.app
        src/libxrpl/protocol/STObject.cpp -> libxrpl.protocol
        include/xrpl/basics/base_uint.h -> xrpl.basics
    """
    parts = file_path.split("/")

    # Get fields 2 and 3 (indices 1 and 2 in 0-based indexing)
    if len(parts) >= 3:
        level = f"{parts[1]}/{parts[2]}"
    elif len(parts) >= 2:
        level = f"{parts[1]}/toplevel"
    else:
        level = file_path

    # If the "level" indicates a file, cut off the filename
    if "." in level.split("/")[-1]:  # Avoid Path object creation
        # Use the "toplevel" label as a workaround for `sort`
        # inconsistencies between different utility versions
        level = level.rsplit("/", 1)[0] + "/toplevel"

    return level.replace("/", ".")
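
# Worked example (hypothetical path): "src/test/jtx.cpp" first yields
# "test/jtx.cpp"; the last component contains a '.', so it is replaced with
# "toplevel" and the function returns "test.toplevel".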


def extract_include_level(include_line: str) -> Optional[str]:
    """
    Extract the include path from an #include directive.
    Gets the first two directory components from the include path.
    Equivalent to bash: cut -d/ -f 1,2

    Examples:
        #include <xrpl/basics/base_uint.h> -> xrpl.basics
        #include "xrpld/app/main/Application.h" -> xrpld.app
    """
    # Remove everything before the quote or angle bracket
    match = INCLUDE_PATH_PATTERN.search(include_line)
    if not match:
        return None

    include_path = match.group(1)
    parts = include_path.split("/")

    # Get first two fields (indices 0 and 1)
    if len(parts) >= 2:
        include_level = f"{parts[0]}/{parts[1]}"
    else:
        include_level = include_path

    # If the "includelevel" indicates a file, cut off the filename
    if "." in include_level.split("/")[-1]:  # Avoid Path object creation
        include_level = include_level.rsplit("/", 1)[0] + "/toplevel"

    return include_level.replace("/", ".")
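
# Worked example (hypothetical include): '#include <xrpl/json.h>' captures
# "xrpl/json.h"; the last component contains a '.', so the function returns
# "xrpl.toplevel".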


def find_repository_directories(
    start_path: Path, depth_limit: int = 10
) -> Tuple[Path, List[Path]]:
    """
    Find the repository root by looking for src or include folders.
    Walks up the directory tree from the start path.
    """
    current = start_path.resolve()

    # Walk up the directory tree
    for _ in range(depth_limit):  # Limit search depth to prevent infinite loops
        src_path = current / "src"
        include_path = current / "include"

        # Check if this directory has src or include folders
        has_src = src_path.exists()
        has_include = include_path.exists()

        if has_src or has_include:
            return current, [src_path, include_path]

        # Move up one level
        parent = current.parent
        if parent == current:  # Reached filesystem root
            break
        current = parent

    # If we couldn't find it, raise an error
    raise RuntimeError(
        "Could not find repository root. "
        "Expected to find a directory containing 'src' and/or 'include' folders."
    )
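
# Note that both <root>/src and <root>/include are returned as soon as either
# one exists, so one of the returned paths may not exist on disk.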


def main():
    # Change to the script's directory
    script_dir = Path(__file__).parent.resolve()
    os.chdir(script_dir)

    # Clean up and create results directory.
    results_dir = script_dir / "results"
    if results_dir.exists():
        import shutil

        shutil.rmtree(results_dir)
    results_dir.mkdir()

    # Find the repository root by searching for src and include directories.
    try:
        repo_root, scan_dirs = find_repository_directories(script_dir)
        print(f"Found repository root: {repo_root}")
        print("Scanning directories:")
        for scan_dir in scan_dirs:
            print(f" - {scan_dir.relative_to(repo_root)}")
    except RuntimeError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print("\nScanning for raw includes...")

    # Find all #include directives
    raw_includes: List[Tuple[str, str]] = []
    rawincludes_file = results_dir / "rawincludes.txt"

    # Write each raw include to the file as we go, and keep the (path, directive)
    # pairs in memory for the levelization step below.
    with open(rawincludes_file, "w", buffering=8192) as raw_f:
        for dir_path in scan_dirs:
            print(f" Scanning {dir_path.relative_to(repo_root)}...")
            for file_path in dir_path.rglob("*"):
                if not file_path.is_file():
                    continue
                try:
                    rel_path_str = str(file_path.relative_to(repo_root))
                    # Read file with a large buffer for performance.
                    with open(
                        file_path,
                        "r",
                        encoding="utf-8",
                        errors="ignore",
                        buffering=8192,
                    ) as f:
                        for line in f:
                            # Quick check before regex
                            if "#include" not in line or "boost" in line:
                                continue
                            if INCLUDE_PATTERN.match(line):
                                line_stripped = line.strip()
                                entry = f"{rel_path_str}:{line_stripped}\n"
                                print(entry, end="")
                                raw_f.write(entry)
                                raw_includes.append((rel_path_str, line_stripped))
                except Exception as e:
                    print(f"Error reading {file_path}: {e}", file=sys.stderr)
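
    # Each entry written above has the form '<relative path>:<include directive>',
    # e.g. (hypothetical):
    #   src/xrpld/app/main/Application.cpp:#include <xrpld/core/JobQueue.h>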

    # Build levelization paths and count directly (no need to sort first).
    print("Build levelization paths")
    path_counts: Dict[Tuple[str, str], int] = defaultdict(int)
    for file_path, include_line in raw_includes:
        include_level = extract_include_level(include_line)
        if not include_level:
            continue
        level = get_level(file_path)
        if level != include_level:
            path_counts[(level, include_level)] += 1
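
    # path_counts now maps (level, include_level) pairs to how often that
    # dependency was seen, e.g. path_counts[("xrpld.app", "xrpl.basics")] == 42
    # (hypothetical count).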

    # Sort and deduplicate paths (using dictionary order like bash 'sort -d').
    print("Sort and deduplicate paths")
    paths_file = results_dir / "paths.txt"
    with open(paths_file, "w") as f:
        # Sort using dictionary order: only alphanumeric and spaces matter
        sorted_items = sorted(
            path_counts.items(),
            key=lambda x: (dictionary_sort_key(x[0][0]), dictionary_sort_key(x[0][1])),
        )
        for (level, include_level), count in sorted_items:
            line = f"{count:7} {level} {include_level}\n"
            print(line.rstrip())
            f.write(line)
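
    # Each paths.txt line is '<count, right-aligned to 7 columns> <level>
    # <include_level>', e.g. (hypothetical): '     42 xrpld.app xrpl.basics'.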

    # Split into flat-file database
    print("Split into flat-file database")
    includes_dir = results_dir / "includes"
    included_by_dir = results_dir / "included_by"
    includes_dir.mkdir()
    included_by_dir.mkdir()

    # Batch writes by grouping data first to avoid repeated file opens.
    includes_data: Dict[str, List[Tuple[str, int]]] = defaultdict(list)
    included_by_data: Dict[str, List[Tuple[str, int]]] = defaultdict(list)

    # Process in sorted order to match bash script behaviour (dictionary order).
    sorted_items = sorted(
        path_counts.items(),
        key=lambda x: (dictionary_sort_key(x[0][0]), dictionary_sort_key(x[0][1])),
    )
    for (level, include_level), count in sorted_items:
        includes_data[level].append((include_level, count))
        included_by_data[include_level].append((level, count))

    # Write all includes files in sorted order (dictionary order).
    for level in sorted(includes_data.keys(), key=dictionary_sort_key):
        entries = includes_data[level]
        with open(includes_dir / level, "w") as f:
            for include_level, count in entries:
                line = f"{include_level} {count}\n"
                print(line.rstrip())
                f.write(line)

    # Write all included_by files in sorted order (dictionary order).
    for include_level in sorted(included_by_data.keys(), key=dictionary_sort_key):
        entries = included_by_data[include_level]
        with open(included_by_dir / include_level, "w") as f:
            for level, count in entries:
                line = f"{level} {count}\n"
                print(line.rstrip())
                f.write(line)
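
    # Resulting layout: results/includes/<level> lists every module that <level>
    # depends on as '<include_level> <count>' lines, while
    # results/included_by/<module> lists the modules that depend on it as
    # '<level> <count>' lines.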

    # Search for loops
    print("Search for loops")
    loops_file = results_dir / "loops.txt"
    ordering_file = results_dir / "ordering.txt"
    loops_found: Set[Tuple[str, str]] = set()

    # Pre-load all include files into memory to avoid repeated I/O.
    # This is the biggest optimisation - we were reading files repeatedly in nested loops.
    # Use list of tuples to preserve file order.
    includes_cache: Dict[str, List[Tuple[str, int]]] = {}
    includes_lookup: Dict[str, Dict[str, int]] = {}  # For fast lookup

    # Note: bash script uses 'for source in *' which uses standard glob sorting,
    # NOT dictionary order. So we use standard sorted() here, not dictionary_sort_key.
    for include_file in sorted(includes_dir.iterdir(), key=lambda p: p.name):
        if not include_file.is_file():
            continue
        includes_cache[include_file.name] = []
        includes_lookup[include_file.name] = {}
        with open(include_file, "r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    include_name = parts[0]
                    include_count = int(parts[1])
                    includes_cache[include_file.name].append(
                        (include_name, include_count)
                    )
                    includes_lookup[include_file.name][include_name] = include_count
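
    # includes_cache preserves the on-disk order of each includes/<level> file as
    # (include_level, count) tuples; includes_lookup holds the same data keyed by
    # include_level so the reverse edge can be checked in O(1) below.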

    with open(loops_file, "w", buffering=8192) as loops_f, open(
        ordering_file, "w", buffering=8192
    ) as ordering_f:
        # Use standard sorting to match bash glob expansion 'for source in *'.
        for source in sorted(includes_cache.keys()):
            source_includes = includes_cache[source]
            for include, include_freq in source_includes:
                # Check if include file exists and references source
                if include not in includes_lookup:
                    continue
                source_freq = includes_lookup[include].get(source)
                if source_freq is not None:
                    # Found a loop
                    loop_key = tuple(sorted([source, include]))
                    if loop_key in loops_found:
                        continue
                    loops_found.add(loop_key)
                    loops_f.write(f"Loop: {source} {include}\n")
                    # If the counts are close, indicate that the two modules are
                    # on the same level, though they shouldn't be.
                    diff = include_freq - source_freq
                    if diff > 3:
                        loops_f.write(f" {source} > {include}\n\n")
                    elif diff < -3:
                        loops_f.write(f" {include} > {source}\n\n")
                    elif source_freq == include_freq:
                        loops_f.write(f" {include} == {source}\n\n")
                    else:
                        loops_f.write(f" {include} ~= {source}\n\n")
                else:
                    ordering_f.write(f"{source} > {include}\n")
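
    # ordering.txt receives one '<source> > <include>' line per one-way dependency,
    # while loops.txt records each mutual include once, e.g. (hypothetical):
    #   Loop: xrpld.app xrpld.core
    #    xrpld.app > xrpld.core
    # with '==' or '~=' instead of '>' when the two counts are equal or within 3
    # of each other.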

    # Print results
    print("\nOrdering:")
    with open(ordering_file, "r") as f:
        print(f.read(), end="")

    print("\nLoops:")
    with open(loops_file, "r") as f:
        print(f.read(), end="")


if __name__ == "__main__":
    main()