2323# Configuration - Tool
2424# --------------------
2525
26- VERSION = "1.0 .0"
26+ VERSION = "1.1 .0"
2727
2828ERROR_BAD_PREREQUISITES = 1
2929
@@ -213,14 +213,16 @@ def get_outside_collaborators():
213213 print ("🔨 Total outside collaborators fetched:" , len (collaborators ))
214214 return collaborators
215215
216- def analyze_repositories (repos , year , count_commits ):
216+ def analyze_repositories (repos , year , count_commits , allowed_domains , forbidden_repos ):
217217 """
218218 Analyze repositories to gather various statistics.
219219
220220 Parameters:
221221 - repos (list): A list of repository dictionaries fetched from the GitHub API.
222- - year (int): The year for which to analyze contributions (e.g., 2024 ).
222+ - year (int): The year for which to analyze contributions (e.g., 2025 ).
223223 - count_commits (bool): A flag indicating whether to count commits in the analysis.
224+ - allowed_domains (list): List of allowed email domains (e.g., ["@orange.com", "@sofrecom.com"])
225+ - forbidden_repos (list): List of repository full names to skip, e.g. because they are too big or contain no relevant contributions (e.g., ["Orange-OpenSource/linux"])
224226
225227 Returns:
226228 dict: A dictionary containing various statistics, including:
@@ -237,17 +239,35 @@ def analyze_repositories(repos, year, count_commits):
237239 - year_repos (int): Number of repositories created in the specified year.
238240 - organization_forks_year (int): Number of forks created by the organization the given year.
239241 - total_commits (int): Total number of commits across all repositories.
240- - top_repos (list): Top 3 repositories by commits.
241- - top_contributors_overall (list): Top 5 contributors overall.
242- - top_contributors_yearly (list): Top 10 contributors for the specified year.
243- - least_used_languages (list): 3 least used programming languages.
242+ - top_repos (list): Top repositories by commits.
243+ - top_contributors_overall (list): Top contributors overall (filtered by domain) .
244+ - top_contributors_yearly (list): Top contributors for the specified year (filtered by domain) .
245+ - least_used_languages (list): Least used programming languages.
244246 - largest_projects (dict): Largest project for each programming language.
247+ - filtered_commits_yearly (int): Number of commits by allowed domain users for the year.
248+ - filtered_commits_total (int): Total number of commits by allowed domain users.
249+ - skipped_repos (list): List of repositories that were skipped.
245250 """
246251 print ("🔨 Analyzing repositories..." )
247- total_repos = len (repos ) # Total number of repositories
248- archived_repos = sum (1 for repo in repos if repo ['archived' ]) # Count archived repositories
249- forked_repos = sum (1 for repo in repos if repo ['fork' ]) # Count forked repositories
250- non_forked_repos = total_repos - forked_repos # Count non-forked repositories
252+
253+ print (f"🔨 Filtering contributors by domains: { allowed_domains } " )
254+ print (f"🔨 Ignoring forbidden repositories: { forbidden_repos } " )
255+
256+ def is_allowed_email (email ):
257+ """Check if an email belongs to allowed domains."""
258+ if not email :
259+ return False
260+ return any (email .lower ().endswith (domain .lower ()) for domain in allowed_domains )
261+
262+ def is_forbidden_repo (repo_full_name ):
263+ """Check if a repository is in the forbidden list."""
264+ return repo_full_name in forbidden_repos
265+
266+ # Basic repository statistics
267+ total_repos = len (repos )
268+ archived_repos = sum (1 for repo in repos if repo ['archived' ])
269+ forked_repos = sum (1 for repo in repos if repo ['fork' ])
270+ non_forked_repos = total_repos - forked_repos
251271
252272 # Total forks count from all repositories
253273 total_forks = sum (repo ['forks_count' ] for repo in repos )
@@ -260,85 +280,141 @@ def analyze_repositories(repos, year, count_commits):
260280 most_stars_repo = max (repos , key = lambda r : r ['stargazers_count' ], default = None )
261281 most_forks_repo = max (repos , key = lambda r : r ['forks_count' ], default = None )
262282
263- # Count programming languages used in the repositories
283+ # Initialize data structures for language and contributor analysis
264284 languages = Counter ()
265- largest_projects = {} # To track the largest project for each language
266- total_contributor_commits = defaultdict (int ) # Total contributions
267- yearly_contributor_commits = defaultdict (int ) # Contributions for the specified year
285+ largest_projects = {}
286+ total_contributor_commits = defaultdict (int ) # Total contributions (filtered)
287+ yearly_contributor_commits = defaultdict (int ) # Yearly contributions (filtered)
268288
289+ # Analyze programming language
269290 for repo in repos :
270- if repo ['language' ] and not repo [ 'fork' ] and not repo [ 'archived' ]: # Exclude forks and archived repos
291+ if repo ['language' ]:
271292 languages [repo ['language' ]] += 1
272- # Track the largest project for each language
273293 if repo ['language' ] not in largest_projects or repo ['size' ] > largest_projects [repo ['language' ]]['size' ]:
274294 largest_projects [repo ['language' ]] = {'name' : repo ['full_name' ], 'size' : repo ['size' ]}
275295
276- # Get the top 5 languages used
296+ # Get the top programming languages used
277297 top_languages = languages .most_common (int (TOP_N_PROG_LANG ))
278298 total_lines = {lang : 0 for lang , _ in top_languages }
279299
280300 # Estimate total lines of code for top languages
281301 for repo in repos :
282- if repo ['language' ] in total_lines and not repo [ 'fork' ] and not repo [ 'archived' ] :
302+ if repo ['language' ] in total_lines :
283303 total_lines [repo ['language' ]] += repo ['size' ]
284304
285- # Count repositories created in a specific year
305+ # Count repositories created in the specified year
286306 year_repos = sum (1 for repo in repos if datetime .strptime (repo ['created_at' ], '%Y-%m-%dT%H:%M:%SZ' ).year == year )
287307
288- # Count forks created by the organization (i.e., forks of other repositories)
308+ # Count forks created by the organization
289309 organization_forks = sum (1 for repo in repos if repo ['fork' ])
290310
291311 # Count licenses used in the repositories
292312 licenses = Counter (repo ['license' ]['name' ] for repo in repos if repo ['license' ])
293313 top_licenses = licenses .most_common (int (TOP_N_LICENSES ))
294314
295- # Calculate total commits across all repositories if enabled
296- total_commits = 0
297- commits_per_repo = {}
315+ # Initialize commit-related variables
316+ total_commits = 0 # Total commits (all contributors)
317+ commits_per_repo = {} # Commits per repo (all contributors)
318+ filtered_commits_yearly = 0 # Commits by allowed domain users (yearly)
319+ filtered_commits_total = 0 # Commits by allowed domain users (total)
320+ skipped_repos = [] # List of skipped repositories
298321
322+ # Calculate commits and contributor statistics if enabled
299323 if count_commits :
300324 for index , repo in enumerate (repos , start = 1 ):
301- # Only count commits for non-forked and non-archived repositories
302- # Some forks are for exmaple from linux project, to much noise in the data
303- if not repo ['fork' ] and not repo ['archived' ]:
304- # Get commits for the specified year
305- commits_count = get_commits_count (repo ['full_name' ], year )
306- total_commits += commits_count
307- commits_per_repo [repo ['full_name' ]] = commits_count
325+ # Skip repositories listed in forbidden_repos
326+ if is_forbidden_repo (repo ['full_name' ]):
327+ print (f"🚫 SKIPPING forbidden repository { index } /{ total_repos } : { repo ['full_name' ]} " )
328+ skipped_repos .append (repo ['full_name' ])
329+ continue
330+
331+ print (f"🔨 Analyzing repository { index } /{ total_repos } : { repo ['full_name' ]} { '(FORK)' if repo ['fork' ] else '' } { '(ARCHIVED)' if repo ['archived' ] else '' } " )
332+
333+ # Define date range for the specified year
334+ since = f"{ year } -01-01T00:00:00Z"
335+ until = f"{ year + 1 } -01-01T00:00:00Z"
336+
337+ # Stats for specified year
338+ yearly_commits_url = f"https://api.github.com/repos/{ repo ['full_name' ]} /commits"
339+ page = 1
340+ yearly_commits_count = 0
341+ yearly_filtered_commits = 0
342+
343+ print (f"🔨 Fetching yearly commits and contributors for { repo ['full_name' ]} ..." )
344+ while True :
345+ response = requests .get (f"{ yearly_commits_url } ?page={ page } &per_page=100&since={ since } &until={ until } " , headers = HEADERS )
346+ if response .status_code != 200 :
347+ print (f"❌ Failed to fetch yearly commits for { repo ['full_name' ]} , status code:" , response .status_code )
348+ break
349+ commits_data = response .json ()
350+ if not commits_data :
351+ break
352+
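353+ # NOTE(review): this sits in the yearly loop, not an all-years one; the check below appears to read author_email before the per-commit loop assigns it, and it adds to yearly_filtered_commits on top of the per-commit count further down - confirm and likely remove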
354+ if author_email and is_allowed_email (author_email ):
355+ yearly_filtered_commits += 1
308356
309- # Get all commits for total contributor calculation
310- all_commits_url = f"https://api.github.com/repos/{ repo ['full_name' ]} /commits"
311- page = 1
312- while True :
313- response = requests .get (f"{ all_commits_url } ?page={ page } &per_page=100" , headers = HEADERS )
314- if response .status_code != 200 :
315- break
316- commits_data = response .json ()
317- if not commits_data :
318- break
319- for commit in commits_data :
320- author = commit ['commit' ]['author' ]['name' ]
321- total_contributor_commits [author ] += 1 # Total contributions
322- # Check if the commit is in the specified year
323- commit_date = commit ['commit' ]['author' ]['date' ]
324- if year == datetime .strptime (commit_date , '%Y-%m-%dT%H:%M:%SZ' ).year :
325- yearly_contributor_commits [author ] += 1 # Contributions for the specified year
326- page += 1
327- print (f"🔨 Analyzing repository { index } /{ total_repos } " )
328-
329- # Get the top N contributors overall
357+ # Analyse contributions by year (with domain filtering)
358+ for commit in commits_data :
359+ if commit .get ('commit' ) and commit ['commit' ].get ('author' ):
360+ author_name = commit ['commit' ]['author' ].get ('name' )
361+ author_email = commit ['commit' ]['author' ].get ('email' )
362+
363+ # Email domain filtering
364+ if author_email and is_allowed_email (author_email ):
365+ if author_name :
366+ yearly_contributor_commits [f"{ author_name } ({ author_email } )" ] += 1
367+ yearly_filtered_commits += 1
368+ page += 1
369+
370+ # Store only commits count for allowed domains
371+ total_commits += yearly_filtered_commits
372+ commits_per_repo [repo ['full_name' ]] = yearly_filtered_commits
373+
374+ # Compute all commits for all contributors
375+ print (f"🔨 Fetching all-time contributors for { repo ['full_name' ]} ..." )
376+ all_commits_url = f"https://api.github.com/repos/{ repo ['full_name' ]} /commits"
377+ page = 1
378+
379+ while True :
380+ response = requests .get (f"{ all_commits_url } ?page={ page } &per_page=100" , headers = HEADERS )
381+ if response .status_code != 200 :
382+ print (f"❌ Failed to fetch all commits for { repo ['full_name' ]} , status code:" , response .status_code )
383+ break
384+ commits_data = response .json ()
385+ if not commits_data :
386+ break
387+
388+ # Analyse overall contributions (with domain filtering)
389+ for commit in commits_data :
390+ if commit .get ('commit' ) and commit ['commit' ].get ('author' ):
391+ author_name = commit ['commit' ]['author' ].get ('name' )
392+ author_email = commit ['commit' ]['author' ].get ('email' )
393+
394+ # Filter by email domain
395+ if author_email and is_allowed_email (author_email ):
396+ if author_name :
397+ total_contributor_commits [f"{ author_name } ({ author_email } )" ] += 1
398+ page += 1
399+
400+ # Calculate top contributors (with filtering on domains)
330401 top_contributors_overall = sorted (total_contributor_commits .items (), key = lambda x : x [1 ], reverse = True )[:int (TOP_N_CONTRIBUTORS_OVERALL )]
331-
332- # Get the top N contributors for the specified year
333402 top_contributors_yearly = sorted (yearly_contributor_commits .items (), key = lambda x : x [1 ], reverse = True )[:int (TOP_N_CONTRIBUTORS_FOR_YEAR )]
334403
335- # Get the top N repositories with the most commits
336- top_repos = sorted (commits_per_repo .items (), key = lambda x : x [1 ], reverse = True )[:int (TOP_N_REPOS_MOST_COMMITS )]
404+ # Get the top repositories with the most commits (only allowed commits)
405+ # Filter repos with no commits from allowed domains
406+ filtered_commits_per_repo = {repo : commits for repo , commits in commits_per_repo .items () if commits > 0 }
407+ top_repos = sorted (filtered_commits_per_repo .items (), key = lambda x : x [1 ], reverse = True )[:int (TOP_N_REPOS_MOST_COMMITS )]
337408
338- # Get the N least used programming languages
409+ # Get the least used programming languages
339410 least_used_languages = sorted (languages .items (), key = lambda x : x [1 ])[:int (TOP_N_LEAST_PROG_LANG )]
340411
341412 print ("🔨 Analysis complete." )
413+ print (f"🔨 Total commits (all contributors): { total_commits } " )
414+ print (f"🔨 Filtered commits yearly (allowed domains): { filtered_commits_yearly } " )
415+ print (f"🔨 Filtered commits total (allowed domains): { filtered_commits_total } " )
416+ print (f"🔨 Skipped repositories: { len (skipped_repos )} " )
417+
342418 return {
343419 "total_repos" : total_repos ,
344420 "archived_repos" : archived_repos ,
@@ -357,9 +433,13 @@ def analyze_repositories(repos, year, count_commits):
357433 "top_contributors_overall" : top_contributors_overall ,
358434 "top_contributors_yearly" : top_contributors_yearly ,
359435 "least_used_languages" : least_used_languages ,
360- "largest_projects" : largest_projects
436+ "largest_projects" : largest_projects ,
437+ "filtered_commits_yearly" : filtered_commits_yearly ,
438+ "filtered_commits_total" : filtered_commits_total ,
439+ "skipped_repos" : skipped_repos
361440 }
362441
442+
363443def get_total_members ():
364444 """Get the total number of members including outside collaborators.
365445
@@ -397,7 +477,7 @@ def main():
397477
398478 # Parse command-line arguments
399479 parser = argparse .ArgumentParser (description = f"Analyze GitHub organization repositories (Version: { VERSION } )." )
400- parser .add_argument ("--year" , type = int , required = True , help = "The year to analyze (e.g., 2024 )." )
480+ parser .add_argument ("--year" , type = int , required = True , help = "The year to analyze (e.g., 2025 )." )
401481 parser .add_argument ("--count-commits" , action = 'store_true' , help = "Enable commit counting in the analysis." )
402482 args = parser .parse_args ()
403483
@@ -407,8 +487,13 @@ def main():
407487 total_members = get_total_members ()
408488 visible_members = len (get_members ())
409489
490+ # Compute only employees / subcontractors / affiliates addresses
491+ allowed_domains = ["@orange.com" , "@sofrecom.com" , "@groupeonepoint.com" , "@sciencespo.fr" , "@inria.fr" ]
492+ # Exclude some repositories if they pollute results
493+ forbidden_repos = [ "Orange-OpenSource/linux" ]
494+
410495 # Analyze repositories for the specified year
411- analysis = analyze_repositories (repos , args .year , args .count_commits )
496+ analysis = analyze_repositories (repos , args .year , args .count_commits , allowed_domains , forbidden_repos )
412497
413498 # Estimate the number of private members
414499 private_members_count = estimate_private_members (total_members , visible_members )
0 commit comments