1
+ import git
2
+ import csv
3
+ import json
4
+
5
+ from collections import defaultdict
6
+ import json
7
+ import os
8
+ from collections import defaultdict
9
+ import subprocess
10
+
11
# Module-level accumulators, each keyed by oracle file name.
# Commit ids collected from the git log walk for each oracle file (assigned in scanJSON).
oracle_commits = defaultdict(list)
# Per-file [TP, FP, FN, empty-commit count] list (assigned in scanJSON, written out as CSV).
oracle_stats = defaultdict(list)
# Commits that is_git_diff_empty judged to be "empty" for each oracle file.
empty_commits = defaultdict(list)
14
+
15
def _git_diff_stat(repo_path, parentCommit, commit, filepath, extra_flags=()):
    """Run ``git diff --stat`` for a single path and return its stripped stdout.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    completed = subprocess.run(
        ['git', '-C', repo_path, 'diff', '--stat', *extra_flags, parentCommit, commit, '--', filepath],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout.strip()


def _line_ending_only_ratio(plain_stat, ignoring_cr_stat):
    """Return the share of the plain diff that disappears with ``--ignore-cr-at-eol``.

    Returns ``None`` when either ``--stat`` output does not have the expected
    shape or when the plain diff reports zero changed lines, so the caller
    never divides by zero or crashes on unexpected output.
    """
    try:
        # Second line is the summary, e.g. "1 file changed, 2 insertions(+), 1 deletion(-)".
        summary = plain_stat.splitlines()[1].split(", ")
        # Entry 0 is the file count; entries 1 and (optionally) 2 are the
        # insertion/deletion counts, of which the larger one is used.
        diff_lines = max(int(part.split()[0]) for part in summary[1:3])
        # First line is the per-file entry, e.g. "path/File.java | 4 ++--".
        remaining = int(ignoring_cr_stat.splitlines()[0].split(" | ")[1].split()[0])
    except (IndexError, ValueError):
        return None
    if diff_lines == 0:
        return None
    return (diff_lines - remaining) / diff_lines


def is_git_diff_empty(commit, parentCommit, filepath, repo_path, fileName):
    """Decide whether ``commit`` changed ``filepath`` (almost) only in line endings.

    The file is diffed against ``parentCommit`` twice with ``git diff --stat``:
    once with ``--ignore-cr-at-eol`` and, if that still reports changes, once
    without it. The commit counts as "empty" when the CR-ignoring diff is
    empty, or when more than 95% of the plain diff's changed lines vanish once
    carriage returns at end of line are ignored.

    Args:
        commit: Id of the commit under inspection.
        parentCommit: Id of the commit to diff against; a falsy value yields False.
        filepath: Path of the file inside the repository.
        repo_path: Path passed to ``git -C``.
        fileName: Key under which an empty commit is recorded in ``empty_commits``.

    Returns:
        True if the commit is considered empty (it is then appended to
        ``empty_commits[fileName]``), otherwise False. False is also returned,
        after printing a message, when git fails or its output cannot be parsed.
    """
    if not parentCommit:
        return False

    try:
        ignoring_cr_stat = _git_diff_stat(
            repo_path, parentCommit, commit, filepath, ('--ignore-cr-at-eol',)
        )
        isEmpty = ignoring_cr_stat == ''

        if not isEmpty:
            plain_stat = _git_diff_stat(repo_path, parentCommit, commit, filepath)
            # Identical output means ignoring CR at EOL hid nothing: a real change.
            if plain_stat == ignoring_cr_stat:
                return False

            ratio = _line_ending_only_ratio(plain_stat, ignoring_cr_stat)
            if ratio is None:
                print(f"Could not interpret git diff --stat output for commit {commit}")
                return False
            isEmpty = ratio > 0.95
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while running git diff: {e.stderr} ")
        return False

    if isEmpty:
        empty_commits[fileName].append(commit)
    return isEmpty
60
+
61
def write_to_csv(data, filename):
    """Write a header plus one statistics row per entry of ``data`` as CSV.

    Args:
        data: Mapping of name -> indexable sequence; items 0-3 fill the
            TP, FP, FN and "Empty commits count" columns.
        filename: Path of the CSV file to create or overwrite.
    """
    header = ["Name", "TP", "FP", "FN", "Empty commits count"]
    with open(filename, mode='w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            [name, counts[0], counts[1], counts[2], counts[3]]
            for name, counts in data.items()
        )
67
+
68
def _commit_ids_in_log(log):
    """Yield the ids found on the ``commit <id>`` header lines of a git log text."""
    for line in log.split('\n'):
        # A header line is exactly "commit " followed by a 40-character id
        # (47 characters in total); every other line is skipped, never unpacked.
        if len(line) != 47 or not line.startswith('commit '):
            continue
        commitId = line[len('commit '):]
        if ' ' not in commitId:
            yield commitId


def getLogCommits(fileName, repo_path, start_commit, file_path, start_line, end_line, introduction_commit, mappings_file, manual_empty_file):
    """Collect the commits ``git log -L`` reports for a line range, skipping "empty" ones.

    The walk starts at ``start_commit`` for ``start_line,end_line:file_path``.
    When a commit has an entry in ``mappings_file[fileName]`` and is either
    listed in ``manual_empty_file[fileName]`` or judged empty by
    ``is_git_diff_empty``, it is not recorded; instead the log is re-run from
    that entry's ``parent_commit_id`` using the line range parsed from its
    ``element_name_after`` and the path in ``element_file_after``.

    Args:
        fileName: Oracle file name used as key into both mapping dictionaries.
        repo_path: Path of the local git repository.
        start_commit: Revision from which the first log is taken.
        file_path: Path of the tracked file for the first log.
        start_line: First line of the tracked range.
        end_line: Last line of the tracked range.
        introduction_commit: Commit id at which the walk stops (inclusive).
        mappings_file: Mapping of file name -> commit id -> info dictionary.
        manual_empty_file: Mapping of file name -> collection of known-empty commit ids.

    Returns:
        Tuple ``(commitIds, empty_commits_count)``: the recorded commit ids in
        log order and the number of empty commits that were skipped.
    """
    repo = git.Repo(repo_path)
    log = repo.git.log(start_commit, '-L', f'{start_line},{end_line}:{file_path}')

    commitIds = []
    empty_commits_count = 0
    restart = True
    while restart:
        restart = False
        for commitId in _commit_ids_in_log(log):
            # Only commits with a mapping entry can be candidates for an "empty" commit.
            if fileName in mappings_file and commitId in mappings_file[fileName]:
                info = mappings_file[fileName][commitId]
                known_empty = fileName in manual_empty_file and commitId in manual_empty_file[fileName]
                # The git diff check runs only when the commit is not already listed as empty.
                if known_empty or is_git_diff_empty(commitId, info['parent_commit_id'], info['element_file_before'], repo_path, fileName):
                    empty_commits_count += 1
                    # element_name_after carries the range as "...$...(<start>-<end>)...".
                    sl, el = info['element_name_after'].split('$')[1].split('(')[1].split(')')[0].split('-')
                    log = repo.git.log(info['parent_commit_id'], '-L', f'{sl},{el}:{info["element_file_after"]}')
                    # Drop the rest of the current log and continue with the new one.
                    restart = True
                    break
            commitIds.append(commitId)
            if commitId == introduction_commit:
                return commitIds, empty_commits_count

    return commitIds, empty_commits_count
99
+
100
def get_file_names_in_directory(directory_path):
    """Return the names of the entries in ``directory_path`` that are files.

    Sub-directories are left out; the order is the one ``os.listdir`` returns.
    """
    return [
        os.path.basename(entry)
        for entry in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, entry))
    ]
106
+
107
def load_json_file(file_path):
    """Open ``file_path`` for reading and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed
110
+
111
def getMethodName(signature):
    """Return the text between the first ``#`` and the next ``(`` of ``signature``.

    Raises:
        IndexError: if ``signature`` contains no ``#``.
    """
    after_hash = signature.split("#", 2)[1]
    name, _, _ = after_hash.partition("(")
    return name
113
+
114
def getFileName(signature):
    """Return the part of ``signature`` after its last ``/`` (all of it if there is none)."""
    return signature.rsplit("/", 1)[-1]
117
+
118
def scanJSON(fileName, data, mappings_file, manual_empty_file):
    """Score the commits found by ``getLogCommits`` against one oracle's expected commits.

    The repository is located under ``../code-tracker/tmp/`` using the
    ``repositoryWebURL`` with the GitHub prefix and a trailing ``.git``
    removed. The commit id of the last ``expectedChanges`` entry is passed to
    ``getLogCommits`` as the commit at which the walk stops.

    Args:
        fileName: Oracle file name; key for the module-level result dictionaries.
        data: Parsed oracle JSON. Reads ``startCommitId``, ``blockStartLine``,
            ``blockEndLine``, ``filePath``, ``repositoryWebURL`` and
            ``expectedChanges`` (a list of dictionaries with ``commitId``).
        mappings_file: Passed through to ``getLogCommits``.
        manual_empty_file: Passed through to ``getLogCommits``.

    Returns:
        Always True. As a side effect ``oracle_commits[fileName]`` is set to
        the collected commit ids and ``oracle_stats[fileName]`` to
        ``[tp, fp, fn, empty_commits_count]``.
    """
    print(f"Processing {fileName} ")
    start_commit = data.get('startCommitId')
    start_line = data.get('blockStartLine')
    end_line = data.get('blockEndLine')
    file_path = data.get('filePath')

    repo_name = data.get('repositoryWebURL').replace('https://github.com/', '')
    # Strip only a trailing ".git"; a blanket replace would also mangle
    # repository names that merely contain ".git" (e.g. "foo.github.io").
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    repo_path = '../code-tracker/tmp/' + repo_name

    # A missing or empty list means there is no stop commit and nothing to match.
    oracleChanges = data.get('expectedChanges') or []
    introduction_commit = oracleChanges[-1].get("commitId") if oracleChanges else None
    oracleCommitIds = {change.get('commitId') for change in oracleChanges}

    commitIds, empty_commits_count = getLogCommits(fileName, repo_path, start_commit, file_path, start_line, end_line, introduction_commit, mappings_file, manual_empty_file)

    # tp/fp are counted per reported commit, fn per distinct oracle commit.
    tp = sum(1 for commitId in commitIds if commitId in oracleCommitIds)
    fp = len(commitIds) - tp
    fn = len(oracleCommitIds - set(commitIds))

    oracle_commits[fileName] = commitIds
    oracle_stats[fileName] = [tp, fp, fn, empty_commits_count]
    return True
160
+
161
+
162
def processFile(oracle_file, mappings_file, manual_empty_file):
    """Load the oracle JSON named ``oracle_file`` and hand it to ``scanJSON``.

    Returns whatever ``scanJSON`` returns.
    """
    # NOTE(review): this prefix is not the directory the __main__ block lists
    # file names from - confirm both resolve to the same set of files.
    oracle_data = load_json_file("./oracle/block/training/" + oracle_file)
    return scanJSON(oracle_file, oracle_data, mappings_file, manual_empty_file)
167
+
168
+
169
if __name__ == "__main__":
    # Script entry point: process every oracle file of the training set, then
    # write the per-file statistics (CSV) and the collected / empty commits (JSON).
    # NOTE(review): file names are listed from this directory, while processFile
    # opens them under "./oracle/block/training/" - confirm the two agree.
    directory_path = "../../src/main/resources/oracle/block/training/"
    file_names = get_file_names_in_directory(directory_path)
    mappings_file = load_json_file("./mappings/training-mappings.json")
    manual_empty_file = load_json_file("./mappings/empty-commits-training.json")

    for file_name in file_names:
        processFile(file_name, mappings_file, manual_empty_file)

    # Make sure the output directory exists so the results of the run are not
    # lost to a FileNotFoundError at the very end.
    os.makedirs('./stats', exist_ok=True)

    write_to_csv(oracle_stats, './stats/gitLog-training-breakdown.csv')
    with open('./stats/gitLog-training.json', 'w') as json_file:
        json.dump(oracle_commits, json_file)
        json_file.write('\n ')
    with open('./stats/empty-commits-training.json', 'w') as json_file:
        json.dump(empty_commits, json_file)
        json_file.write('\n ')
0 commit comments