Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions analyzer/codechecker_analyzer/cachedb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------

import sqlite3
import itertools
import os
from typing import List


class CacheDB:
    """
    SQLite database located in the report directory,
    designed to speed up the parsing process.

    The database consists of a single table, 'plist_lookup', whose rows
    pair a plist file with one of the source files it refers to.

    Instances can be used as context managers; the connection is closed
    when the 'with' block is left.
    """

    # Maximum number of bound parameters used in a single lookup statement.
    # SQLite versions prior to 3.32.0 reject statements with more than 999
    # host parameters by default, so larger lookups are split into batches.
    _MAX_QUERY_PARAMS = 900

    __sqlitedb_path: str
    __con: sqlite3.Connection
    __cur: sqlite3.Cursor

    def __init__(self, report_dir: str, clean: bool = False):
        """
        Initiates the cache database and creates the necessary tables.

        Args:
            report_dir (str): path to the report directory
            clean (bool): If set to True, the previous database
                will be dropped and a new one is created.
        """
        self.__sqlitedb_path = os.path.join(report_dir, "cache.sqlite")

        if clean:
            # Remove directly instead of checking for existence first, so
            # there is no window between the check and the removal.
            try:
                os.remove(self.__sqlitedb_path)
            except FileNotFoundError:
                pass

        self.__create_connection()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close_connection()

    def __create_connection(self):
        self.__con = sqlite3.connect(self.__sqlitedb_path)
        self.__cur = self.__con.cursor()
        self.__create_tables()

    def close_connection(self):
        """
        Closes the connection to the cache database.

        Inserted rows are committed by insert_plist_sources() itself, so
        no data written through this class is pending at this point.
        """
        self.__con.close()

    def __create_tables(self):
        # IF NOT EXISTS makes the check and the creation a single
        # statement, so an already existing table is never an error.
        self.__cur.execute("CREATE TABLE IF NOT EXISTS plist_lookup"
                           "(plist TEXT, source TEXT)")

    def insert_plist_sources(self, plist_file: str, source_files: List[str]):
        """
        Inserts the plist file and its associated source files into the
        cache database. These source files are located in the 'files' section
        of an individual plist file.

        One row is inserted per source file and the change is committed
        immediately.

        Args:
            plist_file (str): path to the plist file
            source_files (List[str]): list of source files mapped to
                the plist file
        """
        rows = zip(itertools.repeat(plist_file), source_files)
        self.__cur.executemany("INSERT INTO plist_lookup VALUES(?, ?)", rows)
        self.__con.commit()

    def plist_query(self, source_files: List[str]) -> List[str]:
        """
        Returns all plist files associated with any of the given source files
        by querying the cache database.

        The lookup is executed in batches so that the number of bound
        parameters of a statement never exceeds the limit of SQLite.

        Args:
            source_files (List[str]): list of source files to be looked up
                from the cache database.

        Returns:
            The matching plist files without duplicates, in no particular
            order. An empty input yields an empty list.
        """
        sources = list(source_files)
        plists = set()

        for start in range(0, len(sources), self._MAX_QUERY_PARAMS):
            batch = sources[start:start + self._MAX_QUERY_PARAMS]
            placeholders = ','.join('?' * len(batch))
            rows = self.__cur.execute(
                "SELECT DISTINCT plist FROM plist_lookup WHERE source"
                f" IN ({placeholders})", batch)
            plists.update(row[0] for row in rows)

        return list(plists)

    def get_indexed_plist_files(self) -> List[str]:
        """
        Returns already indexed plist files from the cache database.
        """
        res = self.__cur.execute("SELECT DISTINCT plist FROM plist_lookup")
        return [row[0] for row in res]
108 changes: 108 additions & 0 deletions analyzer/codechecker_analyzer/cli/reindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------

import os
import sys
from codechecker_analyzer.cachedb import CacheDB
from codechecker_common import arg, logger
from codechecker_common.compatibility.multiprocessing import Pool, cpu_count
from codechecker_report_converter.report.parser import plist as plistparser
from typing import List, Tuple

LOG = logger.get_logger('system')


def get_argparser_ctor_args():
    """
    Build the keyword arguments for constructing the
    argparse.ArgumentParser of the 'reindex' subcommand (either directly
    or as a subparser).
    """
    description = """
The analysis cache database is a SQLite database located in the
report directory, designed to speed up the parsing process.
In case it is missing or outdated, one can use the 'reindex' command to
recreate/update this database."""

    return dict(
        prog='CodeChecker reindex',
        formatter_class=arg.RawDescriptionDefaultHelpFormatter,
        description=description,
        help="Recreate/update the cache database given a report directory.")


def add_arguments_to_parser(parser):
    """
    Register the options of the 'reindex' subcommand on the given
    argparse.ArgumentParser and bind 'main' as the handler of the parser.
    """
    # (flags, keyword arguments) of every option, in registration order.
    option_specs = (
        (('input',), {
            'type': str,
            'nargs': '+',
            'metavar': 'folder',
            'help': "The analysis result folder(s) containing "
                    "analysis results which should be "
                    "reindexed."}),
        (('-j', '--jobs'), {
            'type': int,
            'dest': "jobs",
            'required': False,
            'default': cpu_count(),
            'help': "Number of threads to use for reindex. More "
                    "threads mean faster reindex at the cost of "
                    "using more memory."}),
        (('-f', '--force'), {
            'action': "store_true",
            'dest': "force",
            'required': False,
            'default': False,
            'help': "Drop the previous cache database and do a "
                    "clean reindex."}),
    )

    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    logger.add_verbose_arguments(parser)
    parser.set_defaults(func=main)


def main(args):
    """
    Entry point of the 'reindex' subcommand: set up logging, then update
    the cache database of every report directory given in 'args.input'.
    """
    verbosity = args.verbose if 'verbose' in args else None
    logger.setup_logger(verbosity)

    for report_dir in args.input:
        update_cache_db(report_dir, args.force, args.jobs)


def __process_file(file_path: str) -> Tuple[str, List[str]]:
    """
    Parse a single plist file and collect the source files it refers to.

    Args:
        file_path (str): path to the plist file

    Returns:
        A (file_path, source_files) tuple. The list of source files is
        empty if the parser returns None for the plist file.
    """
    # The file is opened in binary mode on purpose: the plist parser uses
    # lxml for XML parsing, which accepts the data in binary form. The
    # 'encoding' and 'errors' options of open() must not be specified in
    # binary mode, see
    # https://docs.python.org/3/library/functions.html#open
    with open(file_path, 'rb') as fp:
        plist = plistparser.parse(fp)

    if plist is None:
        return (file_path, [])

    return (file_path,
            plistparser.get_file_list(plist, os.path.dirname(file_path)))


def update_cache_db(report_dir: str, force: bool, jobs: int):
    """
    Index the plist files of a report directory in its cache database.

    Only the plist files located directly in the report directory which
    are not yet present in the cache database are parsed. Plist files
    without any source file are not inserted.

    Exits the process with status 1 if the report directory does not exist.

    Args:
        report_dir (str): path to the report directory
        force (bool): if set to True, the previous cache database is
            dropped before indexing
        jobs (int): size of the pool used for parsing the plist files
    """
    if not os.path.isdir(report_dir):
        LOG.error("Directory %s does not exist!", report_dir)
        sys.exit(1)

    report_dir = os.path.abspath(report_dir)
    cachedb = CacheDB(report_dir, force)

    # Close the database connection even if listing the directory, parsing
    # a plist file or inserting a row fails.
    try:
        # A set keeps the membership test below constant-time instead of
        # scanning a list for every plist file.
        indexed_files = set(cachedb.get_indexed_plist_files())

        candidates = (
            os.path.abspath(os.path.join(report_dir, name))
            for name in os.listdir(report_dir)
            if name.endswith(plistparser.EXTENSION))
        plist_files = [path for path in candidates
                       if path not in indexed_files]

        with Pool(jobs) as p:
            res = p.map(__process_file, plist_files)
            for plist_file, sources in res:
                if sources:
                    cachedb.insert_plist_sources(plist_file, sources)
    finally:
        cachedb.close_connection()
11 changes: 11 additions & 0 deletions analyzer/tests/functional/reindex/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# coding=utf-8
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------

# This file is empty, and is only present so that this directory will form a
# package.
8 changes: 8 additions & 0 deletions analyzer/tests/functional/reindex/test_files/a.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include <stdio.h>
#include "a.h"

/*
 * Input of the reindex functional test; it is compiled with "gcc a.c" and
 * analyzed by the test.
 * NOTE(review): the division by zero below looks deliberate, presumably so
 * that the analyzer emits a report for this file -- confirm before changing.
 */
int main()
{
int a = foo();
return 1 / 0;
}
4 changes: 4 additions & 0 deletions analyzer/tests/functional/reindex/test_files/a.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/*
 * Helper defined in the header included by a.c.
 * NOTE(review): the division by zero looks deliberate, presumably so that
 * the header also shows up among the files of the analysis result of a.c
 * (the reindex test expects a.h to be mapped to that plist) -- confirm
 * before changing.
 */
int foo()
{
return 1 / 0;
}
127 changes: 127 additions & 0 deletions analyzer/tests/functional/reindex/test_reindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------

"""
Test reindex functionality.
"""

import os
import shutil
import subprocess
import unittest

from libtest import env
from codechecker_analyzer.cachedb import CacheDB


class TestReindex(unittest.TestCase):
    """
    Functional test of the 'CodeChecker reindex' command: a small project
    is logged and analyzed, then the cache database created by 'reindex'
    is checked.
    """

    _ccClient = None

    def setup_class(self):
        """Setup the environment for the tests."""

        global TEST_WORKSPACE
        TEST_WORKSPACE = env.get_workspace('reindex')

        report_dir = os.path.join(TEST_WORKSPACE, 'reports')
        os.makedirs(report_dir)

        os.environ['TEST_WORKSPACE'] = TEST_WORKSPACE

    def teardown_class(self):
        """Delete the workspace associated with this test"""

        # TODO: If environment variable is set keep the workspace
        # and print out the path.
        global TEST_WORKSPACE

        print("Removing: " + TEST_WORKSPACE)
        shutil.rmtree(TEST_WORKSPACE)

    def setup_method(self, _):
        """Collect the commands and directories used by the test."""

        # TEST_WORKSPACE is exported to the environment by setup_class.
        self.test_workspace = os.environ['TEST_WORKSPACE']

        test_class = self.__class__.__name__
        print('Running ' + test_class + ' tests in ' + self.test_workspace)

        # Get the CodeChecker cmd if needed for the tests.
        self._codechecker_cmd = env.codechecker_cmd()
        self._tu_collector_cmd = env.tu_collector_cmd()
        self.report_dir = os.path.join(self.test_workspace, "reports")
        self.test_dir = os.path.join(os.path.dirname(__file__), 'test_files')

    def __run_cmd(self, cmd, cwd):
        """
        Run the command in the given working directory, print its output
        and fail the test if it exits with a non-zero status.
        """
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=cwd,
            encoding="utf-8",
            errors="ignore")
        out, err = process.communicate()
        print(out)
        print(err)
        self.assertEqual(process.returncode, 0)

    def test_reindex(self):
        """
        Check that 'reindex' creates the cache database and maps the
        source files of the test project to the plist file of a.c.
        """
        build_json = os.path.join(self.test_workspace, "build.json")

        # Create and run log command
        log_cmd = [self._codechecker_cmd, "log", "-b", "gcc a.c",
                   "-o", build_json]
        self.__run_cmd(log_cmd, self.test_dir)

        # Create and run analyze command
        analyze_cmd = [
            self._codechecker_cmd, "analyze", "-c", build_json,
            "--analyzers", "clangsa", "-o", self.report_dir]
        self.__run_cmd(analyze_cmd, self.test_dir)

        plist_files_in_report_dir = [
            os.path.join(self.report_dir, f)
            for f in os.listdir(self.report_dir)
            if os.path.splitext(f)[1] == ".plist"]

        # Check if there are plist files in report_dir
        self.assertGreaterEqual(len(plist_files_in_report_dir), 1)

        # Only the file name is inspected, the directory part of the path
        # must not influence the match.
        a_c_clangsa_plist = next(
            (f for f in plist_files_in_report_dir
             if "a.c_clangsa" in os.path.basename(f)),
            None)

        # Check if a.c_clangsa plist was found
        self.assertIsNotNone(a_c_clangsa_plist)

        # Create and run reindex command
        reindex_cmd = [
            self._codechecker_cmd, "reindex", "-f", self.report_dir]
        self.__run_cmd(reindex_cmd, self.test_dir)

        # Check if CacheDB was created
        self.assertTrue(os.path.isfile(
            os.path.join(self.report_dir, "cache.sqlite")))

        # Load CacheDB; the connection is closed even if an assertion
        # below fails.
        cachedb = CacheDB(self.report_dir)
        self.addCleanup(cachedb.close_connection)

        # Check if a.c_clangsa plist was indexed by the reindex command
        self.assertIn(a_c_clangsa_plist, cachedb.get_indexed_plist_files())

        source_files_in_test_dir = [
            os.path.join(self.test_dir, f)
            for f in os.listdir(self.test_dir)
            if os.path.splitext(f)[1] in [".c", ".h"]]

        # Check if source files were mapped to a.c_clangsa plist
        for f in source_files_in_test_dir:
            self.assertIn(a_c_clangsa_plist, cachedb.plist_query([f]))
Loading
Loading