-
Notifications
You must be signed in to change notification settings - Fork 1
/
gcp-finder.py
68 lines (55 loc) · 2.54 KB
/
gcp-finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import csv
import glob
import json
import pickle
from collections import defaultdict
from tqdm import tqdm
from cstrnfinder import parse_line, check_all
# Findings cache to remove files + code lines duplicates
# doing it because we got like ~33kk results and tons of them are duplicates
# like repository forks or the same 3rd party lib copied around into many projects
#
# key = (dir_and_file, result)
# value = (stars, repo_name) of the best (highest-starred) repo seen for that finding
findings = {}

# Get a dict of GitHub repo stars
# for the used query, see ./bigquery/README.md
print('Loading github-repo-stars.csv')
# Context manager so the CSV handle is actually closed (the original leaked it);
# newline='' is what the csv module docs require for reader input files.
with open('./github-repo-stars.csv', newline='') as stars_file:
    c = csv.reader(stars_file)
    # Validate the header explicitly: `assert` is stripped under `python -O`,
    # and a malformed file would otherwise silently corrupt the mapping.
    header = next(c)
    if header != ['repo_with_stars', 'stars']:
        raise ValueError('unexpected github-repo-stars.csv header: %r' % (header,))
    # repo name -> star count
    repo_stars = {row[0]: int(row[1]) for row in c}
print('Loaded repo_stars')
funcs_to_find = check_all


def dir_and_file(filepath):
    """Return the last two path components ('dir/file'), used as the dedup key."""
    return '/'.join(filepath.split('/')[-2:])


# Scan every downloaded result file; each line is one JSON record with keys
# 'repo_name', 'path' and 'lines' (the candidate source lines to inspect).
for filepath in tqdm(glob.glob('./gcp-results/*'), desc="File"):
    with open(filepath) as f:
        for line in tqdm(f, desc="Lineno"):
            record = json.loads(line)
            for finding in record['lines']:
                for func in funcs_to_find:
                    # Guard clauses replace the original continue/else pyramid.
                    if func not in finding:
                        continue
                    # parse_line returns a string prefixed with [XXX] for
                    # most-likely bugs and [YYY], [ZZZ] or [QQQ] for others.
                    res = parse_line(finding, func, skip_prefix=True)
                    # Keep only [XXX] for now as it's the most interesting thing.
                    if not res or not res.startswith('[XXX]'):
                        continue
                    key = (dir_and_file(record['path']), res)
                    repo_name = record['repo_name']
                    # Keep the highest-starred repo per unique finding.
                    # (-2, '') sorts below any real (stars, name) pair, even
                    # repos absent from repo_stars (which score -1).
                    findings[key] = max(
                        findings.get(key, (-2, '')),
                        (repo_stars.get(repo_name, -1), repo_name),
                    )

# Persist the dedup map for later offline analysis.
with open('findings', 'wb') as f:
    pickle.dump(findings, f)