Skip to content

Commit b4587f5

Browse files
committed
fix: missing contributors in overall stats for GitHub year review (#206)
- Do not ignore forks and archived repositories
- Filter authors by email domain to keep only relevant metrics (e.g., ignore contributors of forks)
- Ignore some projects (too heavy, no contributions, etc.)

Closes #206

Assisted-by: Claude Sonnet 4 (Orange Dinootoo)
Signed-off-by: Pierre-Yves Lapersonne <pierreyves.lapersonne@orange.com>
1 parent 81f02b3 commit b4587f5

File tree

1 file changed

+145
-60
lines changed

1 file changed

+145
-60
lines changed

toolbox/github/github-year-review.py

Lines changed: 145 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
# Configuration - Tool
2424
# --------------------
2525

26-
VERSION = "1.0.0"
26+
VERSION = "1.1.0"
2727

2828
ERROR_BAD_PREREQUISITES = 1
2929

@@ -213,14 +213,16 @@ def get_outside_collaborators():
213213
print("🔨 Total outside collaborators fetched:", len(collaborators))
214214
return collaborators
215215

216-
def analyze_repositories(repos, year, count_commits):
216+
def analyze_repositories(repos, year, count_commits, allowed_domains, forbidden_repos):
217217
"""
218218
Analyze repositories to gather various statistics.
219219
220220
Parameters:
221221
- repos (list): A list of repository dictionaries fetched from the GitHub API.
222-
- year (int): The year for which to analyze contributions (e.g., 2024).
222+
- year (int): The year for which to analyze contributions (e.g., 2025).
223223
- count_commits (bool): A flag indicating whether to count commits in the analysis.
224+
- allowed_domains (list): List of allowed email domains (e.g., ["@orange.com", "@sofrecom.com"])
225+
- forbidden_repos (list): List of repository names to ignore (because of too big, no contributions, etc.) (e.g., ["Orange-OpenSource/linux"])
224226
225227
Returns:
226228
dict: A dictionary containing various statistics, including:
@@ -237,17 +239,35 @@ def analyze_repositories(repos, year, count_commits):
237239
- year_repos (int): Number of repositories created in the specified year.
238240
- organization_forks_year (int): Number of forks created by the organization the given year.
239241
- total_commits (int): Total number of commits across all repositories.
240-
- top_repos (list): Top 3 repositories by commits.
241-
- top_contributors_overall (list): Top 5 contributors overall.
242-
- top_contributors_yearly (list): Top 10 contributors for the specified year.
243-
- least_used_languages (list): 3 least used programming languages.
242+
- top_repos (list): Top repositories by commits.
243+
- top_contributors_overall (list): Top contributors overall (filtered by domain).
244+
- top_contributors_yearly (list): Top contributors for the specified year (filtered by domain).
245+
- least_used_languages (list): Least used programming languages.
244246
- largest_projects (dict): Largest project for each programming language.
247+
- filtered_commits_yearly (int): Number of commits by allowed domain users for the year.
248+
- filtered_commits_total (int): Total number of commits by allowed domain users.
249+
- skipped_repos (list): List of repositories that were skipped.
245250
"""
246251
print("🔨 Analyzing repositories...")
247-
total_repos = len(repos) # Total number of repositories
248-
archived_repos = sum(1 for repo in repos if repo['archived']) # Count archived repositories
249-
forked_repos = sum(1 for repo in repos if repo['fork']) # Count forked repositories
250-
non_forked_repos = total_repos - forked_repos # Count non-forked repositories
252+
253+
print(f"🔨 Filtering contributors by domains: {allowed_domains}")
254+
print(f"🔨 Ignoring forbidden repositories: {forbidden_repos}")
255+
256+
def is_allowed_email(email):
257+
"""Check if an email belongs to allowed domains."""
258+
if not email:
259+
return False
260+
return any(email.lower().endswith(domain.lower()) for domain in allowed_domains)
261+
262+
def is_forbidden_repo(repo_full_name):
263+
"""Check if a repository is in the forbidden list."""
264+
return repo_full_name in forbidden_repos
265+
266+
# Basic repository statistics
267+
total_repos = len(repos)
268+
archived_repos = sum(1 for repo in repos if repo['archived'])
269+
forked_repos = sum(1 for repo in repos if repo['fork'])
270+
non_forked_repos = total_repos - forked_repos
251271

252272
# Total forks count from all repositories
253273
total_forks = sum(repo['forks_count'] for repo in repos)
@@ -260,85 +280,141 @@ def analyze_repositories(repos, year, count_commits):
260280
most_stars_repo = max(repos, key=lambda r: r['stargazers_count'], default=None)
261281
most_forks_repo = max(repos, key=lambda r: r['forks_count'], default=None)
262282

263-
# Count programming languages used in the repositories
283+
# Initialize data structures for language and contributor analysis
264284
languages = Counter()
265-
largest_projects = {} # To track the largest project for each language
266-
total_contributor_commits = defaultdict(int) # Total contributions
267-
yearly_contributor_commits = defaultdict(int) # Contributions for the specified year
285+
largest_projects = {}
286+
total_contributor_commits = defaultdict(int) # Total contributions (filtered)
287+
yearly_contributor_commits = defaultdict(int) # Yearly contributions (filtered)
268288

289+
# Analyze programming language
269290
for repo in repos:
270-
if repo['language'] and not repo['fork'] and not repo['archived']: # Exclude forks and archived repos
291+
if repo['language']:
271292
languages[repo['language']] += 1
272-
# Track the largest project for each language
273293
if repo['language'] not in largest_projects or repo['size'] > largest_projects[repo['language']]['size']:
274294
largest_projects[repo['language']] = {'name': repo['full_name'], 'size': repo['size']}
275295

276-
# Get the top 5 languages used
296+
# Get the top programming languages used
277297
top_languages = languages.most_common(int(TOP_N_PROG_LANG))
278298
total_lines = {lang: 0 for lang, _ in top_languages}
279299

280300
# Estimate total lines of code for top languages
281301
for repo in repos:
282-
if repo['language'] in total_lines and not repo['fork'] and not repo['archived']:
302+
if repo['language'] in total_lines:
283303
total_lines[repo['language']] += repo['size']
284304

285-
# Count repositories created in a specific year
305+
# Count repositories created in the specified year
286306
year_repos = sum(1 for repo in repos if datetime.strptime(repo['created_at'], '%Y-%m-%dT%H:%M:%SZ').year == year)
287307

288-
# Count forks created by the organization (i.e., forks of other repositories)
308+
# Count forks created by the organization
289309
organization_forks = sum(1 for repo in repos if repo['fork'])
290310

291311
# Count licenses used in the repositories
292312
licenses = Counter(repo['license']['name'] for repo in repos if repo['license'])
293313
top_licenses = licenses.most_common(int(TOP_N_LICENSES))
294314

295-
# Calculate total commits across all repositories if enabled
296-
total_commits = 0
297-
commits_per_repo = {}
315+
# Initialize commit-related variables
316+
total_commits = 0 # Total commits (all contributors)
317+
commits_per_repo = {} # Commits per repo (all contributors)
318+
filtered_commits_yearly = 0 # Commits by allowed domain users (yearly)
319+
filtered_commits_total = 0 # Commits by allowed domain users (total)
320+
skipped_repos = [] # List of skipped repositories
298321

322+
# Calculate commits and contributor statistics if enabled
299323
if count_commits:
300324
for index, repo in enumerate(repos, start=1):
301-
# Only count commits for non-forked and non-archived repositories
302-
# Some forks are for exmaple from linux project, to much noise in the data
303-
if not repo['fork'] and not repo['archived']:
304-
# Get commits for the specified year
305-
commits_count = get_commits_count(repo['full_name'], year)
306-
total_commits += commits_count
307-
commits_per_repo[repo['full_name']] = commits_count
325+
# Ignore repos to ignore
326+
if is_forbidden_repo(repo['full_name']):
327+
print(f"🚫 SKIPPING forbidden repository {index}/{total_repos}: {repo['full_name']}")
328+
skipped_repos.append(repo['full_name'])
329+
continue
330+
331+
print(f"🔨 Analyzing repository {index}/{total_repos}: {repo['full_name']} {'(FORK)' if repo['fork'] else ''} {'(ARCHIVED)' if repo['archived'] else ''}")
332+
333+
# Define date range for the specified year
334+
since = f"{year}-01-01T00:00:00Z"
335+
until = f"{year + 1}-01-01T00:00:00Z"
336+
337+
# Stats for specified year
338+
yearly_commits_url = f"https://api.github.com/repos/{repo['full_name']}/commits"
339+
page = 1
340+
yearly_commits_count = 0
341+
yearly_filtered_commits = 0
342+
343+
print(f"🔨 Fetching yearly commits and contributors for {repo['full_name']}...")
344+
while True:
345+
response = requests.get(f"{yearly_commits_url}?page={page}&per_page=100&since={since}&until={until}", headers=HEADERS)
346+
if response.status_code != 200:
347+
print(f"❌ Failed to fetch yearly commits for {repo['full_name']}, status code:", response.status_code)
348+
break
349+
commits_data = response.json()
350+
if not commits_data:
351+
break
352+
353+
# Stats for all years
354+
if author_email and is_allowed_email(author_email):
355+
yearly_filtered_commits += 1
308356

309-
# Get all commits for total contributor calculation
310-
all_commits_url = f"https://api.github.com/repos/{repo['full_name']}/commits"
311-
page = 1
312-
while True:
313-
response = requests.get(f"{all_commits_url}?page={page}&per_page=100", headers=HEADERS)
314-
if response.status_code != 200:
315-
break
316-
commits_data = response.json()
317-
if not commits_data:
318-
break
319-
for commit in commits_data:
320-
author = commit['commit']['author']['name']
321-
total_contributor_commits[author] += 1 # Total contributions
322-
# Check if the commit is in the specified year
323-
commit_date = commit['commit']['author']['date']
324-
if year == datetime.strptime(commit_date, '%Y-%m-%dT%H:%M:%SZ').year:
325-
yearly_contributor_commits[author] += 1 # Contributions for the specified year
326-
page += 1
327-
print(f"🔨 Analyzing repository {index}/{total_repos}")
328-
329-
# Get the top N contributors overall
357+
# Analyse contributions by year (with domain filtering)
358+
for commit in commits_data:
359+
if commit.get('commit') and commit['commit'].get('author'):
360+
author_name = commit['commit']['author'].get('name')
361+
author_email = commit['commit']['author'].get('email')
362+
363+
# Email domain filtering
364+
if author_email and is_allowed_email(author_email):
365+
if author_name:
366+
yearly_contributor_commits[f"{author_name} ({author_email})"] += 1
367+
yearly_filtered_commits += 1
368+
page += 1
369+
370+
# Store only commits count for allowed domains
371+
total_commits += yearly_filtered_commits
372+
commits_per_repo[repo['full_name']] = yearly_filtered_commits
373+
374+
# Compute all commits for all contributors
375+
print(f"🔨 Fetching all-time contributors for {repo['full_name']}...")
376+
all_commits_url = f"https://api.github.com/repos/{repo['full_name']}/commits"
377+
page = 1
378+
379+
while True:
380+
response = requests.get(f"{all_commits_url}?page={page}&per_page=100", headers=HEADERS)
381+
if response.status_code != 200:
382+
print(f"❌ Failed to fetch all commits for {repo['full_name']}, status code:", response.status_code)
383+
break
384+
commits_data = response.json()
385+
if not commits_data:
386+
break
387+
388+
# Analyse overall contributions (with domain filtering)
389+
for commit in commits_data:
390+
if commit.get('commit') and commit['commit'].get('author'):
391+
author_name = commit['commit']['author'].get('name')
392+
author_email = commit['commit']['author'].get('email')
393+
394+
# Filter by email domain
395+
if author_email and is_allowed_email(author_email):
396+
if author_name:
397+
total_contributor_commits[f"{author_name} ({author_email})"] += 1
398+
page += 1
399+
400+
# Calculate top contributors (with filtering on domains)
330401
top_contributors_overall = sorted(total_contributor_commits.items(), key=lambda x: x[1], reverse=True)[:int(TOP_N_CONTRIBUTORS_OVERALL)]
331-
332-
# Get the top N contributors for the specified year
333402
top_contributors_yearly = sorted(yearly_contributor_commits.items(), key=lambda x: x[1], reverse=True)[:int(TOP_N_CONTRIBUTORS_FOR_YEAR)]
334403

335-
# Get the top N repositories with the most commits
336-
top_repos = sorted(commits_per_repo.items(), key=lambda x: x[1], reverse=True)[:int(TOP_N_REPOS_MOST_COMMITS)]
404+
# Get the top repositories with the most commits (only allowed commits)
405+
# Filter repos with no commits from allowed domains
406+
filtered_commits_per_repo = {repo: commits for repo, commits in commits_per_repo.items() if commits > 0}
407+
top_repos = sorted(filtered_commits_per_repo.items(), key=lambda x: x[1], reverse=True)[:int(TOP_N_REPOS_MOST_COMMITS)]
337408

338-
# Get the N least used programming languages
409+
# Get the least used programming languages
339410
least_used_languages = sorted(languages.items(), key=lambda x: x[1])[:int(TOP_N_LEAST_PROG_LANG)]
340411

341412
print("🔨 Analysis complete.")
413+
print(f"🔨 Total commits (all contributors): {total_commits}")
414+
print(f"🔨 Filtered commits yearly (allowed domains): {filtered_commits_yearly}")
415+
print(f"🔨 Filtered commits total (allowed domains): {filtered_commits_total}")
416+
print(f"🔨 Skipped repositories: {len(skipped_repos)}")
417+
342418
return {
343419
"total_repos": total_repos,
344420
"archived_repos": archived_repos,
@@ -357,9 +433,13 @@ def analyze_repositories(repos, year, count_commits):
357433
"top_contributors_overall": top_contributors_overall,
358434
"top_contributors_yearly": top_contributors_yearly,
359435
"least_used_languages": least_used_languages,
360-
"largest_projects": largest_projects
436+
"largest_projects": largest_projects,
437+
"filtered_commits_yearly": filtered_commits_yearly,
438+
"filtered_commits_total": filtered_commits_total,
439+
"skipped_repos": skipped_repos
361440
}
362441

442+
363443
def get_total_members():
364444
"""Get the total number of members including outside collaborators.
365445
@@ -397,7 +477,7 @@ def main():
397477

398478
# Parse command-line arguments
399479
parser = argparse.ArgumentParser(description=f"Analyze GitHub organization repositories (Version: {VERSION}).")
400-
parser.add_argument("--year", type=int, required=True, help="The year to analyze (e.g., 2024).")
480+
parser.add_argument("--year", type=int, required=True, help="The year to analyze (e.g., 2025).")
401481
parser.add_argument("--count-commits", action='store_true', help="Enable commit counting in the analysis.")
402482
args = parser.parse_args()
403483

@@ -407,8 +487,13 @@ def main():
407487
total_members = get_total_members()
408488
visible_members = len(get_members())
409489

490+
# Compute only employees / subcontractors / affiliates addresses
491+
allowed_domains = ["@orange.com", "@sofrecom.com", "@groupeonepoint.com", "@sciencespo.fr", "@inria.fr"]
492+
# Exclude some repositories if they pollute results
493+
forbidden_repos = [ "Orange-OpenSource/linux"]
494+
410495
# Analyze repositories for the specified year
411-
analysis = analyze_repositories(repos, args.year, args.count_commits)
496+
analysis = analyze_repositories(repos, args.year, args.count_commits, allowed_domains, forbidden_repos)
412497

413498
# Estimate the number of private members
414499
private_members_count = estimate_private_members(total_members, visible_members)

0 commit comments

Comments
 (0)