#!/usr/bin/env python
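"""Prints a changelog link for each GitHub repo link it is given, optionally
downloading the changelogs as well.

Example invocations (illustrative; <TOKEN> stands in for a real GitHub API token):
    python changelog_getter.py repos.txt --token <TOKEN>
    cat repos.txt | python changelog_getter.py -t <TOKEN> -o changelogs
"""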
import argparse
import json
import urllib.parse
import urllib.request
from os.path import join
from pathlib import Path
from sys import stdin
from time import sleep
from typing import Iterable, List, Optional

verbosity = 0

target_files = [
    "CHANGELOG",
    "CHANGES",
    "HISTORY",
    "RELEASES",
    "NEWS",
]  # default list; TODO: allow overriding

graphql_root_files_query = """
query GetFilesQuery($owner: String!, $repo: String!) {
  repository(owner: $owner, name: $repo) {
    object(expression: "HEAD:") {
      ... on Tree {
        entries {
          name
          type
        }
      }
    }
  }
}
"""
# Want to play around with the query above? Try https://docs.github.com/en/graphql/overview/explorer


def get_root_file_names(org_or_user: str, repo: str, token: str) -> List[str]:
    variables = {"owner": org_or_user, "repo": repo}
    payload = {"query": graphql_root_files_query, "variables": variables}
    headers = {"Authorization": "Bearer " + token}
    req = urllib.request.Request(
        "https://api.github.com/graphql",
        method="POST",
        headers=headers,
        data=json.dumps(payload).encode("utf-8"),
    )
    with urllib.request.urlopen(req) as u:
        response = json.load(u)
    if "errors" in response:
        if response["errors"][0].get("type", "") == "NOT_FOUND":
            return []
        else:
            raise Exception(response["errors"][0]["message"])
    entries = response["data"]["repository"]["object"]["entries"]
    # "blobs" are files, and we only want files
    return [entry["name"] for entry in entries if entry["type"] == "blob"]
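
# Example call (return value is hypothetical):
#   get_root_file_names(org_or_user="boto", repo="boto3", token=token)
#   might return something like ["README.rst", "CHANGELOG.rst", "setup.py"]
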
def get_urls(urls: Iterable[str], token: str, output_dir: Optional[str] = None):
    """Yields a changelog link for each repo link in urls.

    Args:
        urls: any iterable yielding repo links. Ex: ['https://github.com/15five/changelog-scraper']
        token: GitHub API token
        output_dir: directory to write changelog files to. Defaults to None.

    Yields:
        str: changelog url
    """
    for url in urls:
        # url ex: https://github.com/boto/boto3/
        # clean url
        url = url.strip().rstrip("/")
        if len(url) == 0:
            continue
        if " unknown" in url:
            # the package requirement parser was unable to find a VCS link,
            # so we pass "unknown" through verbatim
            yield url
            continue
        parsed_url = urllib.parse.urlparse(url)
        # we don't care about anything after the repo name
        # ex: "/boto/boto3/blob/main/readme.md" -> ["", "boto", "boto3"]
        separated_path = parsed_url.path.split("/")[:3]
        new_path = "/".join(separated_path)
        url = urllib.parse.urlunparse(
            (
                parsed_url.scheme,
                parsed_url.netloc,
                new_path,
                parsed_url.params,
                parsed_url.query,
                parsed_url.fragment,
            )
        )
        changelog_found = False
        # for getting file contents if a changelog is found
        raw_url = url.replace("github.com", "raw.githubusercontent.com")
        org_or_user = separated_path[1]  # ex: boto
        repo = separated_path[2]  # ex: boto3
        root_file_names = get_root_file_names(
            org_or_user=org_or_user,
            repo=repo,
            token=token,
        )
        if verbosity > 0:
            print("searching through files: " + str(root_file_names))
        for file_name in root_file_names:
            upper_case_name = file_name.upper()
            for target_file in target_files:
                if upper_case_name.startswith(target_file):
                    changelog_found = True
                    raw_url_full = f"{raw_url}/HEAD/{file_name}"
                    if output_dir:
                        out_dir_full = join(output_dir, org_or_user, repo)
                        # urlretrieve requires the target directory to already exist
                        Path(out_dir_full).mkdir(parents=True, exist_ok=True)
                        sleep(1)  # be nice to the API
                        urllib.request.urlretrieve(
                            raw_url_full, join(out_dir_full, "CHANGELOG.md")
                        )
                    yield raw_url_full
                    break
            if changelog_found:
                break
        if not changelog_found:
            yield f"no changelog found for {url}"
        sleep(1)  # be nice to the API


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = "Prints a link to the changelog for each given GitHub repo link, or optionally downloads the changelogs for you"
    parser.add_argument(
        "filepath",
        nargs="?",
        type=argparse.FileType("r"),
        default=stdin,
        help="Path to a file with a newline-separated list of GitHub repo URLs. Defaults to stdin.",
    )
    parser.add_argument("-t", "--token", help="GitHub API Token", type=str, required=True)
    parser.add_argument(
        "-o",
        "--outputdirectory",
        help="If specified, downloads changelogs to this directory in the form dir/owner/repo/CHANGELOG.md",
        type=str,
    )
    parser.add_argument("-v", "--verbosity", action="count", default=0)
    parsed_args = parser.parse_args()
    f = parsed_args.filepath  # argparse actually turns filepath into a file object for us :D
    verbosity = parsed_args.verbosity
    for url in get_urls(
        f, token=parsed_args.token, output_dir=parsed_args.outputdirectory
    ):
        print(url)
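
# Sample run (output values are hypothetical; the two output forms match the
# strings yielded by get_urls above):
#   $ python changelog_getter.py repos.txt -t <TOKEN>
#   https://raw.githubusercontent.com/boto/boto3/HEAD/CHANGELOG.rst
#   no changelog found for https://github.com/example/repo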