From 5c2f7dfae423c50947b82354f3e38ec89147aa92 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 1 Jul 2024 08:59:54 -0400 Subject: [PATCH 1/9] discos request timeout --- llm_github/core.py | 393 ++++++++++++++++++++++++++++++++++++++++++++ llm_github/foo.py | 17 -- local/.env.template | 2 +- poetry.lock | 104 +++++++++++- pyproject.toml | 2 + tests/test_foo.py | 2 +- 6 files changed, 500 insertions(+), 20 deletions(-) create mode 100644 llm_github/core.py delete mode 100644 llm_github/foo.py diff --git a/llm_github/core.py b/llm_github/core.py new file mode 100644 index 0000000..35a14ea --- /dev/null +++ b/llm_github/core.py @@ -0,0 +1,393 @@ +import json +import os +import time +from typing import Dict, List, Optional + +import requests +from dotenv import load_dotenv +from requests_cache import CachedSession, SQLiteCache + +REQUESTS_TIMEOUT = 10 # Timeout in seconds for requests + +# Default fields to be dropped from responses +DEFAULT_DROPPED_FIELDS = [ + "_links", + "base", + "comments_url", + "commits_url", + "diff_url", + "events_url", + "head", + "html_url", + "labels_url", + "locked", + "merge_commit_sha", + "node_id", + "patch_url", + "repository_url", + "review_comment_url", + "review_comments_url", + "statuses_url", + "timeline_url", +] + + +class EnvironmentVariableError(Exception): + """Exception raised for errors in the environment variables.""" + + def __init__(self, variable, message="is not set in the environment."): + self.variable = variable + self.message = message + super().__init__(f"{variable} {message}") + + +# Load environment variables from .env file +load_dotenv(dotenv_path="../local/.env", verbose=True) + +# Global access token for GitHub API +global_token = os.getenv("GITHUB_TOKEN") +if not global_token: + raise EnvironmentVariableError("GITHUB_TOKEN") +print("Token loaded successfully.") + +# Set up cache with SQLite backend +session = CachedSession( + cache_name="llm-github-cache", + backend=SQLiteCache("llm-github.sqlite", timeout=86400), # Cache expires after 24 hours +) + + +def get_rate_limit(token: str) -> Dict[str, int]: + """Fetch current rate limit status from GitHub API.""" + headers = {"Authorization": f"token {token}"} + response = session.get("https://api.github.com/rate_limit", headers=headers, timeout=REQUESTS_TIMEOUT) + response.raise_for_status() # Raises HTTPError for bad requests + return response.json()["rate"] + + +def wait_for_rate_limit_reset(reset_time: int) -> None: + """Wait until the rate limit reset time.""" + wait_time = reset_time - int(time.time()) + 10 # Adding 10 seconds to ensure the reset has occurred + print(f"Rate limit exceeded. Waiting for {wait_time} seconds.") + time.sleep(wait_time) + + +def remove_keys_from_dict(data: Dict, keys_to_remove: List[str]) -> Dict: + """Remove specified keys from a dictionary.""" + return {key: value for key, value in data.items() if key not in keys_to_remove} + + +def write_json_to_file(json_object: List[Dict], filename: str) -> None: + """Save data to a JSON file.""" + with open(filename, "w", encoding="utf-8") as f: + json.dump(json_object, f, ensure_ascii=False, indent=4) + print(f"Data saved to {filename}") + + +def handle_response_errors(response: requests.Response) -> None: + """Handle HTTP errors from a response.""" + if response.status_code == 404: + print("Resource not found. Check the requested resource or permissions.") + elif response.status_code == 403: + print("Access forbidden. 
Ensure token has the required scopes or check for rate limits.") + elif response.status_code == 401: + print("Unauthorized. Check if the token is valid or expired.") + else: + print(f"Failed to fetch data. Status code: {response.status_code}") + print("Error message:", response.text) + + +def github_token_check(token: str) -> Optional[Dict]: + """Validate the GitHub token by fetching user profile.""" + headers = {"Authorization": f"token {token}"} + response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + print("Token is valid. User data retrieved successfully.") + return response.json() + print(f"Failed to authenticate. Status code: {response.status_code}") + return None + + +def list_user_orgs(token: str) -> Optional[List[Dict]]: + """List all organizations the user is a member of.""" + rate_limit = get_rate_limit(token) + if rate_limit["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["reset"]) + headers = {"Authorization": f"token {token}"} + response = session.get("https://api.github.com/user/orgs", headers=headers, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + print("Organizations retrieved successfully.") + return response.json() + handle_response_errors(response) + return None + + +def get_repos(org: str, token: str) -> Optional[List[Dict]]: + """Fetch all repositories for a given organization.""" + rate_limit = get_rate_limit(token) + if rate_limit["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["reset"]) + repos = [] + url = f"https://api.github.com/orgs/{org}/repos" + headers = {"Authorization": f"token {token}"} + while url: + response = session.get(url, headers=headers, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + repos.extend(response.json()) + url = response.links.get("next", {}).get("url") + else: + handle_response_errors(response) + return None + return repos + + +def fetch_issues(org: str, token: str) -> Optional[List[Dict]]: + """Fetch all issues from all repositories in an organization, handling pagination and rate limits.""" + issues = [] + repos = get_repos(org, token) + if not repos: + print("No repositories found or failed to fetch repositories.") + return None + + for repo in repos: + # Ensure the URL is constructed to fetch all issues (not just open ones) + url = repo["issues_url"].replace("{/number}", "?state=all") + while url: + rate_limit = get_rate_limit(token) # Check rate limit before each request + if rate_limit["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["reset"]) + + response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + issues.extend(response.json()) + links = response.links + url = links["next"]["url"] if "next" in links else None + else: + print(f"Failed to fetch issues for {repo['name']}. 
Status code: {response.status_code}") + print("Error message:", response.text) + return None + return issues + + +def sanitize_user_data(data: Dict) -> Dict: + """Recursively sanitize user data to keep only the user 'login'.""" + if isinstance(data, dict): + if "login" in data and set(data.keys()) - {"login"}: + return {"login": data["login"]} + else: + return {key: sanitize_user_data(value) for key, value in data.items()} + elif isinstance(data, list): + return [sanitize_user_data(item) for item in data] + return data + + +def remove_empty_values(data: Dict) -> Dict: + """Recursively remove keys with empty values from a dictionary or list.""" + if isinstance(data, dict): + return {k: remove_empty_values(v) for k, v in data.items() if v or isinstance(v, bool)} + elif isinstance(data, list): + return [remove_empty_values(item) for item in data if item or isinstance(item, bool)] + return data + + +def process_issues(issues: List[Dict], keys_to_remove: List[str]) -> List[Dict]: + """Process a list of issues to sanitize user information and remove empty values.""" + processed_issues = [] + for issue in issues: + sanitized_issue = sanitize_user_data(issue) + cleaned_issue = remove_empty_values(sanitized_issue) + final_issue = remove_keys_from_dict(cleaned_issue, keys_to_remove) + processed_issues.append(final_issue) + return processed_issues + + +def fetch_pull_requests(org: str, token: str) -> Optional[List[Dict]]: + """Fetch all pull requests from all repositories in an organization, handling pagination and rate limits.""" + pull_requests = [] + repos = get_repos(org, token) + if not repos: + print("No repositories found or failed to fetch repositories.") + return None + + for repo in repos: + url = f"{repo['url']}/pulls?state=all" + while url: + rate_limit = get_rate_limit(token) # Check rate limit before each request + if rate_limit["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["reset"]) + + response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + pull_requests.extend(response.json()) + links = response.links + url = links["next"]["url"] if "next" in links else None + else: + print(f"Failed to fetch pull requests for {repo['name']}. 
Status code: {response.status_code}") + print("Error message:", response.text) + return None + return pull_requests + + +def process_pull_requests(pull_requests: List[Dict], keys_to_remove: List[str]) -> List[Dict]: + """Process a list of pull requests to sanitize user information and remove empty values.""" + processed_pull_requests = [] + for pr in pull_requests: + sanitized_pr = sanitize_user_data(pr) + cleaned_pr = remove_empty_values(sanitized_pr) + final_pr = remove_keys_from_dict(cleaned_pr, keys_to_remove) + processed_pull_requests.append(final_pr) + return processed_pull_requests + + +def fetch_all_comments(org: str, token: str) -> Optional[List[Dict]]: + """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while handling pagination and rate limits.""" + all_comments = [] + repos = get_repos(org, token) + if not repos: + print("No repositories found or failed to fetch repositories.") + return None + + for repo in repos: + url = f"{repo['url']}/issues/comments?per_page=100" # Adjusting per_page to fetch more comments per request if needed + while url: + rate_limit = get_rate_limit(token) # Check rate limit before each request + if rate_limit["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["reset"]) + + response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) + if response.status_code == 200: + comments = response.json() + for comment in comments: + if "pull_request" in comment: + comment["type"] = "pull_request" + else: + comment["type"] = "issue" + all_comments.extend(comments) + links = response.links + url = links["next"]["url"] if "next" in links else None + else: + print(f"Failed to fetch comments for {repo['name']}. Status code: {response.status_code}") + print("Error message:", response.text) + return None + return all_comments + + +def process_comments(comments: List[Dict], keys_to_remove: List[str]) -> List[Dict]: + """Process a list of comments to sanitize user information and remove empty values.""" + processed_comments = [] + for comment in comments: + sanitized_comment = sanitize_user_data(comment) + cleaned_comment = remove_empty_values(sanitized_comment) + final_comment = remove_keys_from_dict(cleaned_comment, keys_to_remove) + processed_comments.append(final_comment) + return processed_comments + + +def fetch_all_discussions(org: str, token: str) -> Optional[List[Dict]]: + """Fetch discussions from all repositories in the specified organization.""" + all_discussions = [] + repos = get_repos(org, token) + if repos: + for repo in repos: + repo_name = repo["name"] if isinstance(repo, dict) else repo + print(f"Fetching discussions for repository: {repo_name}") + discussions = fetch_discussions_graphql(org, repo_name, token) + if discussions: + all_discussions.extend(discussions) + else: + print(f"No discussions found or an error occurred for repository: {repo_name}") + return all_discussions + + +def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict]]: + """Fetch discussions using GitHub's GraphQL API.""" + url = "https://api.github.com/graphql" + headers = {"Authorization": f"Bearer {token}"} + query = """ + query FetchDiscussions($org: String!, $repo: String!) 
{ + repository(owner: $org, name: $repo) { + discussions(first: 100) { + nodes { + number + title + url + bodyText + createdAt + updatedAt + author { + login + } + labels(first: 10) { + nodes { + name + description + } + } + } + } + } + } + """ + variables = {"org": org, "repo": repo} + # Added a timeout of 10 seconds + response = requests.post(url, json={"query": query, "variables": variables}, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if "errors" in data: + print(f"GraphQL Errors: {json.dumps(data['errors'], indent=2)}") + return data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", []) + print(f"Failed to fetch discussions. Status code: {response.status_code}") + print("Response: ", response.text) + return None + + +def process_discussions(discussions: List[Dict], keys_to_remove: List[str]) -> List[Dict]: + """Process a list of discussions to sanitize user information, remove empty values, and remove specified keys.""" + processed_discussions = [] + for discussion in discussions: + sanitized_discussion = sanitize_user_data(discussion) + cleaned_discussion = remove_empty_values(sanitized_discussion) + final_discussion = remove_keys_from_dict(cleaned_discussion, keys_to_remove) + processed_discussions.append(final_discussion) + return processed_discussions + + +# Example usage and other utility functions could follow +# Example usage +user_data = github_token_check(global_token) +orgs = list_user_orgs(global_token) + +# turbomam: Resource not found. This could be due to incorrect organization name or insufficient access permissions. +# Error message: {"message":"Not Found","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"404"} + +# microbiomedata: Access forbidden. Check if your token has the required scopes or if there's a rate limit issue. +# Error message: {"message":"`microbiomedata` forbids access via a personal access token (classic). 
Please use a GitHub App, OAuth App, or a personal access token with fine-grained permissions.","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"403"} + +# works: berkeleybop + +org_name = "microbiomedata" + +print("FETCHING REPOS") +repos = get_repos(org_name, global_token) +write_json_to_file(repos, f"{org_name}_repos.json") + +print("FETCHING ISSUES") +org_issues = fetch_issues(org_name, global_token) +sanitized_issues = process_issues(org_issues, DEFAULT_DROPPED_FIELDS) +write_json_to_file(sanitized_issues, f"{org_name}_issues.json") + +print("FETCHING PRs") +pull_requests = fetch_pull_requests(org_name, global_token) +processed_pull_requests = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS) +write_json_to_file(processed_pull_requests, f"{org_name}_prs.json") + +print("FETCHING COMMENTS") +comments = fetch_all_comments(org_name, global_token) +processed_comments = process_comments(comments, DEFAULT_DROPPED_FIELDS) +write_json_to_file(processed_comments, f"{org_name}_comments.json") + +print("FETCHING DISCUSSIONS") +all_discussions = fetch_all_discussions(org_name, global_token) +processed_discussions = process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS) +print(f"Total discussions fetched from all repositories: {len(processed_discussions)}") +write_json_to_file(processed_discussions, f"{org_name}_discussions.json") diff --git a/llm_github/foo.py b/llm_github/foo.py deleted file mode 100644 index 8b7396d..0000000 --- a/llm_github/foo.py +++ /dev/null @@ -1,17 +0,0 @@ -def foo(bar: str) -> str: - """Summary line. - - Extended description of function. - - Args: - bar: Description of input argument. - - Returns: - Description of return value - """ - - return bar - - -if __name__ == "__main__": # pragma: no cover - pass diff --git a/local/.env.template b/local/.env.template index 74d0a43..3b926cd 100644 --- a/local/.env.template +++ b/local/.env.template @@ -1 +1 @@ -foo=bar +GITHUB_TOKEN= diff --git a/poetry.lock b/poetry.lock index 1586a71..a940bf6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -15,6 +15,25 @@ files = [ six = ">=1.6.1,<2.0" wheel = ">=0.23.0,<1.0" +[[package]] +name = "attrs" +version = "23.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"}, + {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, +] + +[package.extras] +cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] +dev = ["attrs[tests]", "pre-commit"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] +tests = ["attrs[tests-no-zope]", "zope-interface"] +tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] +tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] + [[package]] name = "babel" version = "2.15.0" @@ -43,6 +62,31 @@ files = [ {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, ] +[[package]] +name = "cattrs" +version = "23.2.3" +description = "Composable complex class support for attrs and dataclasses." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cattrs-23.2.3-py3-none-any.whl", hash = "sha256:0341994d94971052e9ee70662542699a3162ea1e0c62f7ce1b4a57f563685108"}, + {file = "cattrs-23.2.3.tar.gz", hash = "sha256:a934090d95abaa9e911dac357e3a8699e0b4b14f8529bcc7d2b1ad9d51672b9f"}, +] + +[package.dependencies] +attrs = ">=23.1.0" +exceptiongroup = {version = ">=1.1.1", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.1.0,<4.6.3 || >4.6.3", markers = "python_version < \"3.11\""} + +[package.extras] +bson = ["pymongo (>=4.4.0)"] +cbor2 = ["cbor2 (>=5.4.6)"] +msgpack = ["msgpack (>=1.0.5)"] +orjson = ["orjson (>=3.9.2)"] +pyyaml = ["pyyaml (>=6.0)"] +tomlkit = ["tomlkit (>=0.11.8)"] +ujson = ["ujson (>=5.7.0)"] + [[package]] name = "certifi" version = "2024.6.2" @@ -929,6 +973,20 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2024.1" @@ -1123,6 +1181,36 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-cache" +version = "1.2.1" +description = "A persistent cache for python requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests_cache-1.2.1-py3-none-any.whl", hash = "sha256:1285151cddf5331067baa82598afe2d47c7495a1334bfe7a7d329b43e9fd3603"}, + {file = "requests_cache-1.2.1.tar.gz", hash = "sha256:68abc986fdc5b8d0911318fbb5f7c80eebcd4d01bfacc6685ecf8876052511d1"}, +] + +[package.dependencies] +attrs = ">=21.2" +cattrs = ">=22.2" +platformdirs = ">=2.5" +requests = ">=2.22" +url-normalize = ">=1.4" +urllib3 = ">=1.25.5" + +[package.extras] +all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=6.0.1)", "redis (>=3)", "ujson (>=5.4)"] +bson = ["bson (>=0.5)"] +docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.9)"] +dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"] +json = ["ujson (>=5.4)"] +mongodb = ["pymongo (>=3)"] +redis = ["redis (>=3)"] +security = ["itsdangerous (>=2.0)"] +yaml = ["pyyaml (>=6.0.1)"] + [[package]] name = "six" version = "1.16.0" @@ -1183,6 +1271,20 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "url-normalize" +version = "1.4.3" +description = "URL normalization for Python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"}, + {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"}, 
+] + +[package.dependencies] +six = "*" + [[package]] name = "urllib3" version = "2.2.2" @@ -1296,4 +1398,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "a3b14f6f7cdd13dce3b5a26933ebce280805aba59061c1f199e1e5f2ad527883" +content-hash = "d86f001781d611b808f7aabbf1e6125bfa210513a12386cabeadb7c5db8db447" diff --git a/pyproject.toml b/pyproject.toml index 09d370a..7077635 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,8 @@ packages = [ [tool.poetry.dependencies] python = ">=3.8,<4.0" +python-dotenv = "^1.0.1" +requests-cache = "^1.2.1" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" diff --git a/tests/test_foo.py b/tests/test_foo.py index 42c9b38..5ec9206 100644 --- a/tests/test_foo.py +++ b/tests/test_foo.py @@ -1,4 +1,4 @@ -from llm_github.foo import foo +from llm_github.core import foo def test_foo(): From 3bc21d047b423194b2aed207fdbd7481f0210aa0 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 1 Jul 2024 10:04:08 -0400 Subject: [PATCH 2/9] noqa mock token in tests --- llm_github/core.py | 102 +++++++++++------------------------------- llm_github/execute.py | 84 ++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- tests/test_core.py | 17 +++++++ tests/test_foo.py | 5 --- 5 files changed, 127 insertions(+), 83 deletions(-) create mode 100644 llm_github/execute.py create mode 100644 tests/test_core.py delete mode 100644 tests/test_foo.py diff --git a/llm_github/core.py b/llm_github/core.py index 35a14ea..5ad7c79 100644 --- a/llm_github/core.py +++ b/llm_github/core.py @@ -1,11 +1,9 @@ import json -import os import time from typing import Dict, List, Optional import requests -from dotenv import load_dotenv -from requests_cache import CachedSession, SQLiteCache +from requests_cache import CachedSession REQUESTS_TIMEOUT = 10 # Timeout in seconds for requests @@ -41,23 +39,12 @@ def __init__(self, variable, message="is not set in the environment."): super().__init__(f"{variable} {message}") -# Load environment variables from .env file -load_dotenv(dotenv_path="../local/.env", verbose=True) +def return_verbatim(input_string: str) -> str: + """Return the input string.""" + return input_string -# Global access token for GitHub API -global_token = os.getenv("GITHUB_TOKEN") -if not global_token: - raise EnvironmentVariableError("GITHUB_TOKEN") -print("Token loaded successfully.") -# Set up cache with SQLite backend -session = CachedSession( - cache_name="llm-github-cache", - backend=SQLiteCache("llm-github.sqlite", timeout=86400), # Cache expires after 24 hours -) - - -def get_rate_limit(token: str) -> Dict[str, int]: +def get_rate_limit(token: str, session: CachedSession) -> Dict[str, int]: """Fetch current rate limit status from GitHub API.""" headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/rate_limit", headers=headers, timeout=REQUESTS_TIMEOUT) @@ -97,7 +84,7 @@ def handle_response_errors(response: requests.Response) -> None: print("Error message:", response.text) -def github_token_check(token: str) -> Optional[Dict]: +def github_token_check(token: str, session: CachedSession) -> Optional[Dict]: """Validate the GitHub token by fetching user profile.""" headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT) @@ -108,9 +95,9 @@ def github_token_check(token: str) -> Optional[Dict]: return None -def list_user_orgs(token: 
str) -> Optional[List[Dict]]: +def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict]]: """List all organizations the user is a member of.""" - rate_limit = get_rate_limit(token) + rate_limit = get_rate_limit(token, session) if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) headers = {"Authorization": f"token {token}"} @@ -122,9 +109,9 @@ def list_user_orgs(token: str) -> Optional[List[Dict]]: return None -def get_repos(org: str, token: str) -> Optional[List[Dict]]: +def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: """Fetch all repositories for a given organization.""" - rate_limit = get_rate_limit(token) + rate_limit = get_rate_limit(token, session) if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) repos = [] @@ -141,10 +128,10 @@ def get_repos(org: str, token: str) -> Optional[List[Dict]]: return repos -def fetch_issues(org: str, token: str) -> Optional[List[Dict]]: +def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: """Fetch all issues from all repositories in an organization, handling pagination and rate limits.""" issues = [] - repos = get_repos(org, token) + repos = get_repos(org, token, session) if not repos: print("No repositories found or failed to fetch repositories.") return None @@ -153,7 +140,7 @@ def fetch_issues(org: str, token: str) -> Optional[List[Dict]]: # Ensure the URL is constructed to fetch all issues (not just open ones) url = repo["issues_url"].replace("{/number}", "?state=all") while url: - rate_limit = get_rate_limit(token) # Check rate limit before each request + rate_limit = get_rate_limit(token, session) # Check rate limit before each request if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) @@ -201,10 +188,10 @@ def process_issues(issues: List[Dict], keys_to_remove: List[str]) -> List[Dict]: return processed_issues -def fetch_pull_requests(org: str, token: str) -> Optional[List[Dict]]: +def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: """Fetch all pull requests from all repositories in an organization, handling pagination and rate limits.""" pull_requests = [] - repos = get_repos(org, token) + repos = get_repos(org, token, session) if not repos: print("No repositories found or failed to fetch repositories.") return None @@ -212,7 +199,7 @@ def fetch_pull_requests(org: str, token: str) -> Optional[List[Dict]]: for repo in repos: url = f"{repo['url']}/pulls?state=all" while url: - rate_limit = get_rate_limit(token) # Check rate limit before each request + rate_limit = get_rate_limit(token, session) # Check rate limit before each request if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) @@ -239,18 +226,20 @@ def process_pull_requests(pull_requests: List[Dict], keys_to_remove: List[str]) return processed_pull_requests -def fetch_all_comments(org: str, token: str) -> Optional[List[Dict]]: - """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while handling pagination and rate limits.""" +def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: + """Fetch all comments from all repositories in an organization, + distinguishing between issue and PR comments, while handling pagination and rate limits.""" all_comments = [] - repos = get_repos(org, token) + repos = get_repos(org, token, session) if not repos: print("No 
repositories found or failed to fetch repositories.") return None for repo in repos: - url = f"{repo['url']}/issues/comments?per_page=100" # Adjusting per_page to fetch more comments per request if needed + # Adjusting per_page to fetch more comments per request if needed + url = f"{repo['url']}/issues/comments?per_page=100" while url: - rate_limit = get_rate_limit(token) # Check rate limit before each request + rate_limit = get_rate_limit(token, session) # Check rate limit before each request if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) @@ -283,10 +272,10 @@ def process_comments(comments: List[Dict], keys_to_remove: List[str]) -> List[Di return processed_comments -def fetch_all_discussions(org: str, token: str) -> Optional[List[Dict]]: +def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: """Fetch discussions from all repositories in the specified organization.""" all_discussions = [] - repos = get_repos(org, token) + repos = get_repos(org, token, session) if repos: for repo in repos: repo_name = repo["name"] if isinstance(repo, dict) else repo @@ -350,44 +339,3 @@ def process_discussions(discussions: List[Dict], keys_to_remove: List[str]) -> L final_discussion = remove_keys_from_dict(cleaned_discussion, keys_to_remove) processed_discussions.append(final_discussion) return processed_discussions - - -# Example usage and other utility functions could follow -# Example usage -user_data = github_token_check(global_token) -orgs = list_user_orgs(global_token) - -# turbomam: Resource not found. This could be due to incorrect organization name or insufficient access permissions. -# Error message: {"message":"Not Found","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"404"} - -# microbiomedata: Access forbidden. Check if your token has the required scopes or if there's a rate limit issue. -# Error message: {"message":"`microbiomedata` forbids access via a personal access token (classic). 
Please use a GitHub App, OAuth App, or a personal access token with fine-grained permissions.","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"403"} - -# works: berkeleybop - -org_name = "microbiomedata" - -print("FETCHING REPOS") -repos = get_repos(org_name, global_token) -write_json_to_file(repos, f"{org_name}_repos.json") - -print("FETCHING ISSUES") -org_issues = fetch_issues(org_name, global_token) -sanitized_issues = process_issues(org_issues, DEFAULT_DROPPED_FIELDS) -write_json_to_file(sanitized_issues, f"{org_name}_issues.json") - -print("FETCHING PRs") -pull_requests = fetch_pull_requests(org_name, global_token) -processed_pull_requests = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS) -write_json_to_file(processed_pull_requests, f"{org_name}_prs.json") - -print("FETCHING COMMENTS") -comments = fetch_all_comments(org_name, global_token) -processed_comments = process_comments(comments, DEFAULT_DROPPED_FIELDS) -write_json_to_file(processed_comments, f"{org_name}_comments.json") - -print("FETCHING DISCUSSIONS") -all_discussions = fetch_all_discussions(org_name, global_token) -processed_discussions = process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS) -print(f"Total discussions fetched from all repositories: {len(processed_discussions)}") -write_json_to_file(processed_discussions, f"{org_name}_discussions.json") diff --git a/llm_github/execute.py b/llm_github/execute.py new file mode 100644 index 0000000..70ab80e --- /dev/null +++ b/llm_github/execute.py @@ -0,0 +1,84 @@ +import os + +from core import ( + DEFAULT_DROPPED_FIELDS, + CachedSession, + EnvironmentVariableError, + fetch_all_comments, + fetch_all_discussions, + fetch_issues, + fetch_pull_requests, + get_repos, + github_token_check, + list_user_orgs, + process_comments, + process_discussions, + process_issues, + process_pull_requests, + write_json_to_file, +) +from dotenv import load_dotenv +from requests_cache.backends.sqlite import SQLiteCache + +# Load environment variables from .env file +load_dotenv(dotenv_path="local/.env", verbose=True) + +# Global access token for GitHub API +global_token = os.environ["GITHUB_TOKEN"] +if not global_token: + raise EnvironmentVariableError("GITHUB_TOKEN") +print("Token loaded successfully.") + +# Set up cache with SQLite backend +session = CachedSession( + cache_name="llm-github-cache", + backend=SQLiteCache("llm-github.sqlite", timeout=86400), # Cache expires after 24 hours +) + +user_data = github_token_check(global_token, session=session) +orgs = list_user_orgs(global_token, session=session) + +# turbomam: Resource not found. This could be due to incorrect organization name or insufficient access permissions. +# Error message: +# { +# "message": "Not Found", +# "documentation_url": "https://docs.github.com/rest/repos/repos#list-organization-repositories", +# "status": "404" +# } + +# microbiomedata: Access forbidden. Check if your token has the required scopes or if there's a rate limit issue. +# Error message: +# { +# "message": "`microbiomedata` forbids access via a personal access token (classic). 
Please use a GitHub App, OAuth App, or a personal access token with fine-grained permissions.", +# "documentation_url": "https://docs.github.com/rest/repos/repos#list-organization-repositories", +# "status": "403" +# } + +# works: berkeleybop + +org_name = "microbiomedata" + +print("FETCHING REPOS") +repos = get_repos(org_name, global_token, session=session) +write_json_to_file(repos, f"{org_name}_repos.json") + +print("FETCHING ISSUES") +org_issues = fetch_issues(org_name, global_token, session=session) +sanitized_issues = process_issues(org_issues, DEFAULT_DROPPED_FIELDS) +write_json_to_file(sanitized_issues, f"{org_name}_issues.json") + +print("FETCHING PRs") +pull_requests = fetch_pull_requests(org_name, global_token, session=session) +processed_pull_requests = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS) +write_json_to_file(processed_pull_requests, f"{org_name}_prs.json") + +print("FETCHING COMMENTS") +comments = fetch_all_comments(org_name, global_token, session=session) +processed_comments = process_comments(comments, DEFAULT_DROPPED_FIELDS) +write_json_to_file(processed_comments, f"{org_name}_comments.json") + +print("FETCHING DISCUSSIONS") +all_discussions = fetch_all_discussions(org_name, global_token, session=session) +processed_discussions = process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS) +print(f"Total discussions fetched from all repositories: {len(processed_discussions)}") +write_json_to_file(processed_discussions, f"{org_name}_discussions.json") diff --git a/pyproject.toml b/pyproject.toml index 7077635..c5e93e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "llm_github" version = "0.0.1" description = "Tools for extracting knowledge from GitHub issues, PR comments, etc." -authors = ["Mark Andrew Miller "] +authors = ["Mark Andrew Miller "] repository = "https://github.com/turbomam/llm-github" documentation = "https://turbomam.github.io/llm-github/" readme = "README.md" diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 0000000..88d95a8 --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,17 @@ +from unittest.mock import Mock + +from llm_github.core import get_rate_limit, return_verbatim + + +def test_return_verbatim(): + assert return_verbatim("foo") == "foo" + + +def test_get_rate_limit(): + mock_session = Mock() + mock_session.get.return_value.status_code = 200 + mock_session.get.return_value.json.return_value = {"rate": {"limit": 5000, "remaining": 4999}} + + token = "fake_token" # noqa: S105 + result = get_rate_limit(token, mock_session) + assert result == {"limit": 5000, "remaining": 4999} diff --git a/tests/test_foo.py b/tests/test_foo.py deleted file mode 100644 index 5ec9206..0000000 --- a/tests/test_foo.py +++ /dev/null @@ -1,5 +0,0 @@ -from llm_github.core import foo - - -def test_foo(): - assert foo("foo") == "foo" From 543719d75839297fa302c93a78ffdca799f23811 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Mon, 1 Jul 2024 10:11:15 -0400 Subject: [PATCH 3/9] Apply pre-commit fixes --- .cruft.json | 46 ++++++++++----------- .github/actions/setup-poetry-env/action.yml | 8 ++-- .github/workflows/main.yml | 2 +- .github/workflows/on-release-main.yml | 4 +- LICENSE | 1 - mkdocs.yml | 6 +-- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/.cruft.json b/.cruft.json index 8377fc9..d9b6d49 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,25 +1,25 @@ { - "template": "https://github.com/fpgmaas/cookiecutter-poetry.git", - "commit": "f448c9c6407c799f6b81b8e310608cb841e98d15", - "checkout": null, - "context": { - "cookiecutter": { - "author": "Mark Andrew Miller", - "email": "mamillerpa@gmail.com", - "author_github_handle": "turbomam", - "project_name": "llm-github", - "project_slug": "llm_github", - "project_description": "Tools for extracting knowledge from GitHub issues, PR comments, etc.", - "include_github_actions": "y", - "publish_to": "pypi", - "deptry": "y", - "mkdocs": "y", - "codecov": "y", - "dockerfile": "y", - "devcontainer": "n", - "open_source_license": "MIT license", - "_template": "https://github.com/fpgmaas/cookiecutter-poetry.git" - } - }, - "directory": null + "template": "https://github.com/fpgmaas/cookiecutter-poetry.git", + "commit": "f448c9c6407c799f6b81b8e310608cb841e98d15", + "checkout": null, + "context": { + "cookiecutter": { + "author": "Mark Andrew Miller", + "email": "mamillerpa@gmail.com", + "author_github_handle": "turbomam", + "project_name": "llm-github", + "project_slug": "llm_github", + "project_description": "Tools for extracting knowledge from GitHub issues, PR comments, etc.", + "include_github_actions": "y", + "publish_to": "pypi", + "deptry": "y", + "mkdocs": "y", + "codecov": "y", + "dockerfile": "y", + "devcontainer": "n", + "open_source_license": "MIT license", + "_template": "https://github.com/fpgmaas/cookiecutter-poetry.git" + } + }, + "directory": null } diff --git a/.github/actions/setup-poetry-env/action.yml b/.github/actions/setup-poetry-env/action.yml index b2cd2df..d786811 100644 --- a/.github/actions/setup-poetry-env/action.yml +++ b/.github/actions/setup-poetry-env/action.yml @@ -2,10 +2,10 @@ name: "setup-poetry-env" description: "Composite action to setup the Python and poetry environment." 
inputs: - python-version: - required: false - description: "The python version to use" - default: "3.11" + python-version: + required: false + description: "The python version to use" + default: "3.11" runs: using: "composite" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d193f81..679ba4a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ["3.8", "3.9", "3.10", "3.11"] fail-fast: false steps: - name: Check out diff --git a/.github/workflows/on-release-main.yml b/.github/workflows/on-release-main.yml index 85f5200..6a13b1c 100644 --- a/.github/workflows/on-release-main.yml +++ b/.github/workflows/on-release-main.yml @@ -6,7 +6,6 @@ on: branches: [main] jobs: - publish: runs-on: ubuntu-latest steps: @@ -28,7 +27,7 @@ jobs: env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} RELEASE_VERSION: ${{ steps.vars.outputs.tag }} - + deploy-docs: needs: publish runs-on: ubuntu-latest @@ -41,4 +40,3 @@ jobs: - name: Deploy documentation run: poetry run mkdocs gh-deploy --force - diff --git a/LICENSE b/LICENSE index 0da21c0..8d8fd7e 100644 --- a/LICENSE +++ b/LICENSE @@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/mkdocs.yml b/mkdocs.yml index 212a5b1..c8cfd91 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -15,9 +15,9 @@ plugins: - mkdocstrings: handlers: python: - setup_commands: - - import sys - - sys.path.append('../') + setup_commands: + - import sys + - sys.path.append('../') theme: name: material feature: From 1375edec9377ded6b2f90966f9388096690d5e82 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 1 Jul 2024 10:16:15 -0400 Subject: [PATCH 4/9] document core module not foo --- docs/modules.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules.md b/docs/modules.md index 07e4b93..faa1bf3 100644 --- a/docs/modules.md +++ b/docs/modules.md @@ -1 +1 @@ -::: llm_github.foo +::: llm_github.core From c5529814e76bd7ff2ad288f6583de95af9faed28 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Mon, 1 Jul 2024 10:48:12 -0400 Subject: [PATCH 5/9] ruff edits --- .gitignore | 1 + llm_github/core.py | 56 ++++++++++++++++++------------------ llm_github/execute.py | 67 ++++++++++++++++++------------------------- poetry.lock | 16 ++++++++++- pyproject.toml | 1 + 5 files changed, 73 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index c913412..26f2208 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +*.sqlite ### diff --git a/llm_github/core.py b/llm_github/core.py index 5ad7c79..cb4ca99 100644 --- a/llm_github/core.py +++ b/llm_github/core.py @@ -1,6 +1,6 @@ import json import time -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import requests from requests_cache import CachedSession @@ -8,7 +8,7 @@ REQUESTS_TIMEOUT = 10 # Timeout in seconds for requests # Default fields to be dropped from responses -DEFAULT_DROPPED_FIELDS = [ +DEFAULT_DROPPED_FIELDS: List[str] = [ "_links", "base", "comments_url", @@ -33,7 +33,7 @@ class EnvironmentVariableError(Exception): """Exception raised for errors in the environment variables.""" - def __init__(self, variable, message="is not set in the environment."): + def __init__(self, variable: str, message: str = "is not set in the environment.") -> None: self.variable = variable self.message = message super().__init__(f"{variable} {message}") @@ -59,12 +59,12 @@ def wait_for_rate_limit_reset(reset_time: int) -> None: time.sleep(wait_time) -def remove_keys_from_dict(data: Dict, keys_to_remove: List[str]) -> Dict: +def remove_keys_from_dict(data: Dict[str, Any], keys_to_remove: List[str]) -> Dict[str, Any]: """Remove specified keys from a dictionary.""" return {key: value for key, value in data.items() if key not in keys_to_remove} -def write_json_to_file(json_object: List[Dict], filename: str) -> None: +def write_json_to_file(json_object: List[Dict[str, Any]], filename: str) -> None: """Save data to a JSON file.""" with open(filename, "w", encoding="utf-8") as f: json.dump(json_object, f, ensure_ascii=False, indent=4) @@ -84,7 +84,7 @@ def handle_response_errors(response: requests.Response) -> None: print("Error message:", response.text) -def github_token_check(token: str, session: CachedSession) -> Optional[Dict]: +def github_token_check(token: str, session: CachedSession) -> Optional[Dict[str, Any]]: """Validate the GitHub token by fetching user profile.""" headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT) @@ -95,7 +95,7 @@ def github_token_check(token: str, session: CachedSession) -> Optional[Dict]: return None -def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict]]: +def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """List all organizations the user is a member of.""" rate_limit = get_rate_limit(token, session) if rate_limit["remaining"] == 0: @@ -109,12 +109,12 @@ def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict]]: return None -def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: +def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all repositories for a given organization.""" rate_limit = get_rate_limit(token, session) if rate_limit["remaining"] == 0: wait_for_rate_limit_reset(rate_limit["reset"]) - repos = [] + repos: List[Dict[str, Any]] = [] url = f"https://api.github.com/orgs/{org}/repos" 
headers = {"Authorization": f"token {token}"} while url: @@ -128,9 +128,9 @@ def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dic return repos -def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: +def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all issues from all repositories in an organization, handling pagination and rate limits.""" - issues = [] + issues: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if not repos: print("No repositories found or failed to fetch repositories.") @@ -156,7 +156,7 @@ def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[ return issues -def sanitize_user_data(data: Dict) -> Dict: +def sanitize_user_data(data: Any) -> Any: """Recursively sanitize user data to keep only the user 'login'.""" if isinstance(data, dict): if "login" in data and set(data.keys()) - {"login"}: @@ -168,7 +168,7 @@ def sanitize_user_data(data: Dict) -> Dict: return data -def remove_empty_values(data: Dict) -> Dict: +def remove_empty_values(data: Any) -> Any: """Recursively remove keys with empty values from a dictionary or list.""" if isinstance(data, dict): return {k: remove_empty_values(v) for k, v in data.items() if v or isinstance(v, bool)} @@ -177,9 +177,9 @@ def remove_empty_values(data: Dict) -> Dict: return data -def process_issues(issues: List[Dict], keys_to_remove: List[str]) -> List[Dict]: +def process_issues(issues: List[Dict[str, Any]], keys_to_remove: List[str]) -> List[Dict[str, Any]]: """Process a list of issues to sanitize user information and remove empty values.""" - processed_issues = [] + processed_issues: List[Dict[str, Any]] = [] for issue in issues: sanitized_issue = sanitize_user_data(issue) cleaned_issue = remove_empty_values(sanitized_issue) @@ -188,9 +188,9 @@ def process_issues(issues: List[Dict], keys_to_remove: List[str]) -> List[Dict]: return processed_issues -def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: +def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all pull requests from all repositories in an organization, handling pagination and rate limits.""" - pull_requests = [] + pull_requests: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if not repos: print("No repositories found or failed to fetch repositories.") @@ -215,9 +215,9 @@ def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optiona return pull_requests -def process_pull_requests(pull_requests: List[Dict], keys_to_remove: List[str]) -> List[Dict]: +def process_pull_requests(pull_requests: List[Dict[str, Any]], keys_to_remove: List[str]) -> List[Dict[str, Any]]: """Process a list of pull requests to sanitize user information and remove empty values.""" - processed_pull_requests = [] + processed_pull_requests: List[Dict[str, Any]] = [] for pr in pull_requests: sanitized_pr = sanitize_user_data(pr) cleaned_pr = remove_empty_values(sanitized_pr) @@ -226,10 +226,10 @@ def process_pull_requests(pull_requests: List[Dict], keys_to_remove: List[str]) return processed_pull_requests -def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: +def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while 
handling pagination and rate limits.""" - all_comments = [] + all_comments: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if not repos: print("No repositories found or failed to fetch repositories.") @@ -261,9 +261,9 @@ def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional return all_comments -def process_comments(comments: List[Dict], keys_to_remove: List[str]) -> List[Dict]: +def process_comments(comments: List[Dict[str, Any]], keys_to_remove: List[str]) -> List[Dict[str, Any]]: """Process a list of comments to sanitize user information and remove empty values.""" - processed_comments = [] + processed_comments: List[Dict[str, Any]] = [] for comment in comments: sanitized_comment = sanitize_user_data(comment) cleaned_comment = remove_empty_values(sanitized_comment) @@ -272,9 +272,9 @@ def process_comments(comments: List[Dict], keys_to_remove: List[str]) -> List[Di return processed_comments -def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optional[List[Dict]]: +def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch discussions from all repositories in the specified organization.""" - all_discussions = [] + all_discussions: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if repos: for repo in repos: @@ -288,7 +288,7 @@ def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optio return all_discussions -def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict]]: +def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict[str, Any]]]: """Fetch discussions using GitHub's GraphQL API.""" url = "https://api.github.com/graphql" headers = {"Authorization": f"Bearer {token}"} @@ -330,9 +330,9 @@ def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[ return None -def process_discussions(discussions: List[Dict], keys_to_remove: List[str]) -> List[Dict]: +def process_discussions(discussions: List[Dict[str, Any]], keys_to_remove: List[str]) -> List[Dict[str, Any]]: """Process a list of discussions to sanitize user information, remove empty values, and remove specified keys.""" - processed_discussions = [] + processed_discussions: List[Dict[str, Any]] = [] for discussion in discussions: sanitized_discussion = sanitize_user_data(discussion) cleaned_discussion = remove_empty_values(sanitized_discussion) diff --git a/llm_github/execute.py b/llm_github/execute.py index 70ab80e..7a2ab58 100644 --- a/llm_github/execute.py +++ b/llm_github/execute.py @@ -1,8 +1,9 @@ import os +from typing import Dict, List, Optional +# Fixing import conflicts by adjusting namespace and avoiding re-importing CachedSession from core import ( DEFAULT_DROPPED_FIELDS, - CachedSession, EnvironmentVariableError, fetch_all_comments, fetch_all_discussions, @@ -18,67 +19,55 @@ write_json_to_file, ) from dotenv import load_dotenv +from requests_cache import CachedSession from requests_cache.backends.sqlite import SQLiteCache # Load environment variables from .env file load_dotenv(dotenv_path="local/.env", verbose=True) # Global access token for GitHub API -global_token = os.environ["GITHUB_TOKEN"] +global_token: str = os.getenv("GITHUB_TOKEN", "") if not global_token: raise EnvironmentVariableError("GITHUB_TOKEN") print("Token loaded successfully.") # Set up cache with SQLite backend -session = CachedSession( +session: CachedSession = CachedSession( cache_name="llm-github-cache", 
backend=SQLiteCache("llm-github.sqlite", timeout=86400), # Cache expires after 24 hours ) -user_data = github_token_check(global_token, session=session) -orgs = list_user_orgs(global_token, session=session) +user_data: Optional[Dict] = github_token_check(global_token, session=session) +orgs: Optional[List[Dict]] = list_user_orgs(global_token, session=session) -# turbomam: Resource not found. This could be due to incorrect organization name or insufficient access permissions. -# Error message: -# { -# "message": "Not Found", -# "documentation_url": "https://docs.github.com/rest/repos/repos#list-organization-repositories", -# "status": "404" -# } - -# microbiomedata: Access forbidden. Check if your token has the required scopes or if there's a rate limit issue. -# Error message: -# { -# "message": "`microbiomedata` forbids access via a personal access token (classic). Please use a GitHub App, OAuth App, or a personal access token with fine-grained permissions.", -# "documentation_url": "https://docs.github.com/rest/repos/repos#list-organization-repositories", -# "status": "403" -# } - -# works: berkeleybop - -org_name = "microbiomedata" +org_name: str = "microbiomedata" print("FETCHING REPOS") -repos = get_repos(org_name, global_token, session=session) -write_json_to_file(repos, f"{org_name}_repos.json") +repos: Optional[List[Dict]] = get_repos(org_name, global_token, session=session) +if repos: + write_json_to_file(repos, f"{org_name}_repos.json") print("FETCHING ISSUES") -org_issues = fetch_issues(org_name, global_token, session=session) -sanitized_issues = process_issues(org_issues, DEFAULT_DROPPED_FIELDS) -write_json_to_file(sanitized_issues, f"{org_name}_issues.json") +org_issues: Optional[List[Dict]] = fetch_issues(org_name, global_token, session=session) +if org_issues: + sanitized_issues: List[Dict] = process_issues(org_issues, DEFAULT_DROPPED_FIELDS) + write_json_to_file(sanitized_issues, f"{org_name}_issues.json") print("FETCHING PRs") -pull_requests = fetch_pull_requests(org_name, global_token, session=session) -processed_pull_requests = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS) -write_json_to_file(processed_pull_requests, f"{org_name}_prs.json") +pull_requests: Optional[List[Dict]] = fetch_pull_requests(org_name, global_token, session=session) +if pull_requests: + processed_pull_requests: List[Dict] = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS) + write_json_to_file(processed_pull_requests, f"{org_name}_prs.json") print("FETCHING COMMENTS") -comments = fetch_all_comments(org_name, global_token, session=session) -processed_comments = process_comments(comments, DEFAULT_DROPPED_FIELDS) -write_json_to_file(processed_comments, f"{org_name}_comments.json") +comments: Optional[List[Dict]] = fetch_all_comments(org_name, global_token, session=session) +if comments: + processed_comments: List[Dict] = process_comments(comments, DEFAULT_DROPPED_FIELDS) + write_json_to_file(processed_comments, f"{org_name}_comments.json") print("FETCHING DISCUSSIONS") -all_discussions = fetch_all_discussions(org_name, global_token, session=session) -processed_discussions = process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS) -print(f"Total discussions fetched from all repositories: {len(processed_discussions)}") -write_json_to_file(processed_discussions, f"{org_name}_discussions.json") +all_discussions: Optional[List[Dict]] = fetch_all_discussions(org_name, global_token, session=session) +if all_discussions: + processed_discussions: List[Dict] = 
process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS) + print(f"Total discussions fetched from all repositories: {len(processed_discussions)}") + write_json_to_file(processed_discussions, f"{org_name}_discussions.json") diff --git a/poetry.lock b/poetry.lock index a940bf6..3ec6125 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1260,6 +1260,20 @@ virtualenv = ">=20.25" docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-argparse-cli (>=1.11.1)", "sphinx-autodoc-typehints (>=1.25.2)", "sphinx-copybutton (>=0.5.2)", "sphinx-inline-tabs (>=2023.4.21)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.11)"] testing = ["build[virtualenv] (>=1.0.3)", "covdefaults (>=2.3)", "detect-test-pollution (>=1.2)", "devpi-process (>=1)", "diff-cover (>=8.0.2)", "distlib (>=0.3.8)", "flaky (>=3.7)", "hatch-vcs (>=0.4)", "hatchling (>=1.21)", "psutil (>=5.9.7)", "pytest (>=7.4.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-xdist (>=3.5)", "re-assert (>=1.1)", "time-machine (>=2.13)", "wheel (>=0.42)"] +[[package]] +name = "types-requests" +version = "2.32.0.20240622" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-requests-2.32.0.20240622.tar.gz", hash = "sha256:ed5e8a412fcc39159d6319385c009d642845f250c63902718f605cd90faade31"}, + {file = "types_requests-2.32.0.20240622-py3-none-any.whl", hash = "sha256:97bac6b54b5bd4cf91d407e62f0932a74821bc2211f22116d9ee1dd643826caf"}, +] + +[package.dependencies] +urllib3 = ">=2" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1398,4 +1412,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "d86f001781d611b808f7aabbf1e6125bfa210513a12386cabeadb7c5db8db447" +content-hash = "20dc492ca082a6cbb83ce664ff5b6c5749e847c4370d0bda28955b2cbd9eb67d" diff --git a/pyproject.toml b/pyproject.toml index c5e93e7..4b688a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ packages = [ python = ">=3.8,<4.0" python-dotenv = "^1.0.1" requests-cache = "^1.2.1" +types-requests = "^2.32.0.20240622" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" From 307339897b3675b2cf2ab63a0cd7ce5606163682 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Mon, 1 Jul 2024 11:30:22 -0400 Subject: [PATCH 6/9] ruff reformats --- llm_github/core.py | 67 +++++++++++++++++++++++++++++-------------- llm_github/execute.py | 9 +++--- tests/test_core.py | 7 +++-- 3 files changed, 55 insertions(+), 28 deletions(-) diff --git a/llm_github/core.py b/llm_github/core.py index cb4ca99..bf11792 100644 --- a/llm_github/core.py +++ b/llm_github/core.py @@ -4,6 +4,7 @@ import requests from requests_cache import CachedSession +from typing_extensions import TypedDict # Use from typing_extensions for compatibility with older Python versions REQUESTS_TIMEOUT = 10 # Timeout in seconds for requests @@ -34,22 +35,43 @@ class EnvironmentVariableError(Exception): """Exception raised for errors in the environment variables.""" def __init__(self, variable: str, message: str = "is not set in the environment.") -> None: - self.variable = variable - self.message = message + self.variable: str = variable + self.message: str = message super().__init__(f"{variable} {message}") +class RateLimit(TypedDict): + limit: int + remaining: int + reset: int + used: int + + +class RateLimitResources(TypedDict): + core: RateLimit + graphql: RateLimit + # Add other resources as needed + + +class RateLimitResponse(TypedDict): + rate: RateLimit + resources: RateLimitResources + remaining: int + reset: int + + def return_verbatim(input_string: str) -> str: """Return the input string.""" return input_string -def get_rate_limit(token: str, session: CachedSession) -> Dict[str, int]: +def get_rate_limit(token: str, session: CachedSession) -> RateLimitResponse: """Fetch current rate limit status from GitHub API.""" headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/rate_limit", headers=headers, timeout=REQUESTS_TIMEOUT) response.raise_for_status() # Raises HTTPError for bad requests - return response.json()["rate"] + rate_limit_response: RateLimitResponse = response.json() + return rate_limit_response def wait_for_rate_limit_reset(reset_time: int) -> None: @@ -90,21 +112,23 @@ def github_token_check(token: str, session: CachedSession) -> Optional[Dict[str, response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: print("Token is valid. User data retrieved successfully.") - return response.json() - print(f"Failed to authenticate. 
Status code: {response.status_code}") + user_data: Dict[str, Any] = response.json() + return user_data + handle_response_errors(response) return None def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """List all organizations the user is a member of.""" rate_limit = get_rate_limit(token, session) - if rate_limit["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["reset"]) + if rate_limit["resources"]["core"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/user/orgs", headers=headers, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: print("Organizations retrieved successfully.") - return response.json() + orgs: List[Dict[str, Any]] = response.json() + return orgs handle_response_errors(response) return None @@ -112,8 +136,8 @@ def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict[str def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all repositories for a given organization.""" rate_limit = get_rate_limit(token, session) - if rate_limit["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["reset"]) + if rate_limit["resources"]["core"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) repos: List[Dict[str, Any]] = [] url = f"https://api.github.com/orgs/{org}/repos" headers = {"Authorization": f"token {token}"} @@ -141,8 +165,8 @@ def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[ url = repo["issues_url"].replace("{/number}", "?state=all") while url: rate_limit = get_rate_limit(token, session) # Check rate limit before each request - if rate_limit["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["reset"]) + if rate_limit["resources"]["core"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -200,8 +224,8 @@ def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optiona url = f"{repo['url']}/pulls?state=all" while url: rate_limit = get_rate_limit(token, session) # Check rate limit before each request - if rate_limit["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["reset"]) + if rate_limit["resources"]["core"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -227,8 +251,7 @@ def process_pull_requests(pull_requests: List[Dict[str, Any]], keys_to_remove: L def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: - """Fetch all comments from all repositories in an organization, - distinguishing between issue and PR comments, while handling pagination and rate limits.""" + """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while handling pagination and rate limits.""" all_comments: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if not repos: @@ -240,8 +263,8 @@ def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional url = f"{repo['url']}/issues/comments?per_page=100" while url: rate_limit = get_rate_limit(token, session) # Check rate limit 
before each request - if rate_limit["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["reset"]) + if rate_limit["resources"]["core"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -285,7 +308,7 @@ def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optio all_discussions.extend(discussions) else: print(f"No discussions found or an error occurred for repository: {repo_name}") - return all_discussions + return all_discussions if all_discussions else None def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict[str, Any]]]: @@ -318,13 +341,13 @@ def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[ } """ variables = {"org": org, "repo": repo} - # Added a timeout of 10 seconds response = requests.post(url, json={"query": query, "variables": variables}, headers=headers, timeout=10) if response.status_code == 200: data = response.json() if "errors" in data: print(f"GraphQL Errors: {json.dumps(data['errors'], indent=2)}") - return data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", []) + discussions = data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", []) + return discussions if discussions is not None else [] print(f"Failed to fetch discussions. Status code: {response.status_code}") print("Response: ", response.text) return None diff --git a/llm_github/execute.py b/llm_github/execute.py index 7a2ab58..0006db2 100644 --- a/llm_github/execute.py +++ b/llm_github/execute.py @@ -1,8 +1,12 @@ import os from typing import Dict, List, Optional +from dotenv import load_dotenv +from requests_cache import CachedSession +from requests_cache.backends.sqlite import SQLiteCache + # Fixing import conflicts by adjusting namespace and avoiding re-importing CachedSession -from core import ( +from llm_github.core import ( DEFAULT_DROPPED_FIELDS, EnvironmentVariableError, fetch_all_comments, @@ -18,9 +22,6 @@ process_pull_requests, write_json_to_file, ) -from dotenv import load_dotenv -from requests_cache import CachedSession -from requests_cache.backends.sqlite import SQLiteCache # Load environment variables from .env file load_dotenv(dotenv_path="local/.env", verbose=True) diff --git a/tests/test_core.py b/tests/test_core.py index 88d95a8..3f0a3b4 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -10,8 +10,11 @@ def test_return_verbatim(): def test_get_rate_limit(): mock_session = Mock() mock_session.get.return_value.status_code = 200 - mock_session.get.return_value.json.return_value = {"rate": {"limit": 5000, "remaining": 4999}} + mock_session.get.return_value.json.return_value = { + "rate": {"limit": 5000, "remaining": 4999, "reset": 1234567890, "used": 1} + } token = "fake_token" # noqa: S105 result = get_rate_limit(token, mock_session) - assert result == {"limit": 5000, "remaining": 4999} + expected = {"limit": 5000, "remaining": 4999, "reset": 1234567890, "used": 1} + assert result["rate"] == expected From 1caa867a01c9f4f471f1a88139464b7bef809538 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Mon, 1 Jul 2024 11:40:55 -0400 Subject: [PATCH 7/9] experimentign with dev dependencies --- poetry.lock | 2 +- pyproject.toml | 41 ++++++++++++----------------------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3ec6125..9e5203a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1412,4 +1412,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "20dc492ca082a6cbb83ce664ff5b6c5749e847c4370d0bda28955b2cbd9eb67d" +content-hash = "b96e61b42223ef20b9be841454933163b8fc2dc93e145ef8231310088467a481" diff --git a/pyproject.toml b/pyproject.toml index 4b688a2..7f08748 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,14 +7,13 @@ repository = "https://github.com/turbomam/llm-github" documentation = "https://turbomam.github.io/llm-github/" readme = "README.md" packages = [ - {include = "llm_github"} + { include = "llm_github" } ] [tool.poetry.dependencies] python = ">=3.8,<4.0" python-dotenv = "^1.0.1" requests-cache = "^1.2.1" -types-requests = "^2.32.0.20240622" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" @@ -23,11 +22,12 @@ deptry = "^0.12.0" mypy = "^1.5.1" pre-commit = "^3.4.0" tox = "^4.11.1" +types-requests = "^2.32.0.20240622" [tool.poetry.group.docs.dependencies] mkdocs = "^1.4.2" mkdocs-material = "^9.2.7" -mkdocstrings = {extras = ["python"], version = "^0.23.0"} +mkdocstrings = { extras = ["python"], version = "^0.23.0" } [build-system] requires = ["poetry-core>=1.0.0"] @@ -35,13 +35,13 @@ build-backend = "poetry.core.masonry.api" [tool.mypy] files = ["llm_github"] -disallow_untyped_defs = "True" -disallow_any_unimported = "True" -no_implicit_optional = "True" -check_untyped_defs = "True" -warn_return_any = "True" -warn_unused_ignores = "True" -show_error_codes = "True" +disallow_untyped_defs = true +disallow_any_unimported = true +no_implicit_optional = true +check_untyped_defs = true +warn_return_any = true +warn_unused_ignores = true +show_error_codes = true [tool.pytest.ini_options] testpaths = ["tests"] @@ -51,41 +51,25 @@ target-version = "py37" line-length = 120 fix = true select = [ - # flake8-2020 "YTT", - # flake8-bandit "S", - # flake8-bugbear "B", - # flake8-builtins "A", - # flake8-comprehensions "C4", - # flake8-debugger "T10", - # flake8-simplify "SIM", - # isort "I", - # mccabe "C90", - # pycodestyle - "E", "W", - # pyflakes + "E", + "W", "F", - # pygrep-hooks "PGH", - # pyupgrade "UP", - # ruff "RUF", - # tryceratops "TRY", ] ignore = [ - # LineTooLong "E501", - # DoNotAssignLambda "E731", ] @@ -99,6 +83,5 @@ skip_empty = true branch = true source = ["llm_github"] - [tool.ruff.per-file-ignores] "tests/*" = ["S101"] From 6f6edc9f3965cbe28fee24c7bc300d4742fa79ef Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Mon, 1 Jul 2024 12:02:20 -0400 Subject: [PATCH 8/9] running make check locally --- llm_github/core.py | 66 ++++++++++++++++++++++------------------------ poetry.lock | 2 +- pyproject.toml | 1 + 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/llm_github/core.py b/llm_github/core.py index bf11792..efeddb7 100644 --- a/llm_github/core.py +++ b/llm_github/core.py @@ -9,7 +9,7 @@ REQUESTS_TIMEOUT = 10 # Timeout in seconds for requests # Default fields to be dropped from responses -DEFAULT_DROPPED_FIELDS: List[str] = [ +DEFAULT_DROPPED_FIELDS = [ "_links", "base", "comments_url", @@ -35,8 +35,8 @@ class EnvironmentVariableError(Exception): """Exception raised for errors in the environment variables.""" def __init__(self, variable: str, message: str = "is not set in the environment.") -> None: - self.variable: str = variable - self.message: str = message + self.variable = variable + self.message = message super().__init__(f"{variable} {message}") @@ -47,17 +47,9 @@ class RateLimit(TypedDict): used: int -class RateLimitResources(TypedDict): - core: RateLimit - graphql: RateLimit - # Add other resources as needed - - class RateLimitResponse(TypedDict): rate: RateLimit - resources: RateLimitResources - remaining: int - reset: int + resources: Dict[str, RateLimit] def return_verbatim(input_string: str) -> str: @@ -70,8 +62,8 @@ def get_rate_limit(token: str, session: CachedSession) -> RateLimitResponse: headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/rate_limit", headers=headers, timeout=REQUESTS_TIMEOUT) response.raise_for_status() # Raises HTTPError for bad requests - rate_limit_response: RateLimitResponse = response.json() - return rate_limit_response + data: RateLimitResponse = response.json() + return data def wait_for_rate_limit_reset(reset_time: int) -> None: @@ -112,23 +104,23 @@ def github_token_check(token: str, session: CachedSession) -> Optional[Dict[str, response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: print("Token is valid. User data retrieved successfully.") - user_data: Dict[str, Any] = response.json() - return user_data - handle_response_errors(response) + data: Dict[str, Any] = response.json() + return data + print(f"Failed to authenticate. 
Status code: {response.status_code}") return None def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """List all organizations the user is a member of.""" rate_limit = get_rate_limit(token, session) - if rate_limit["resources"]["core"]["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) + if rate_limit["rate"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["rate"]["reset"]) headers = {"Authorization": f"token {token}"} response = session.get("https://api.github.com/user/orgs", headers=headers, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: print("Organizations retrieved successfully.") - orgs: List[Dict[str, Any]] = response.json() - return orgs + data: List[Dict[str, Any]] = response.json() + return data handle_response_errors(response) return None @@ -136,8 +128,8 @@ def list_user_orgs(token: str, session: CachedSession) -> Optional[List[Dict[str def get_repos(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: """Fetch all repositories for a given organization.""" rate_limit = get_rate_limit(token, session) - if rate_limit["resources"]["core"]["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) + if rate_limit["rate"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["rate"]["reset"]) repos: List[Dict[str, Any]] = [] url = f"https://api.github.com/orgs/{org}/repos" headers = {"Authorization": f"token {token}"} @@ -165,8 +157,8 @@ def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[ url = repo["issues_url"].replace("{/number}", "?state=all") while url: rate_limit = get_rate_limit(token, session) # Check rate limit before each request - if rate_limit["resources"]["core"]["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) + if rate_limit["rate"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["rate"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -180,7 +172,7 @@ def fetch_issues(org: str, token: str, session: CachedSession) -> Optional[List[ return issues -def sanitize_user_data(data: Any) -> Any: +def sanitize_user_data(data: Dict[str, Any]) -> Dict[str, Any]: """Recursively sanitize user data to keep only the user 'login'.""" if isinstance(data, dict): if "login" in data and set(data.keys()) - {"login"}: @@ -224,8 +216,8 @@ def fetch_pull_requests(org: str, token: str, session: CachedSession) -> Optiona url = f"{repo['url']}/pulls?state=all" while url: rate_limit = get_rate_limit(token, session) # Check rate limit before each request - if rate_limit["resources"]["core"]["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) + if rate_limit["rate"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["rate"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -251,7 +243,8 @@ def process_pull_requests(pull_requests: List[Dict[str, Any]], keys_to_remove: L def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional[List[Dict[str, Any]]]: - """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while handling pagination and rate limits.""" + """Fetch all comments from all repositories in an organization, + distinguishing between issue and PR 
comments, while handling pagination and rate limits.""" all_comments: List[Dict[str, Any]] = [] repos = get_repos(org, token, session) if not repos: @@ -263,8 +256,8 @@ def fetch_all_comments(org: str, token: str, session: CachedSession) -> Optional url = f"{repo['url']}/issues/comments?per_page=100" while url: rate_limit = get_rate_limit(token, session) # Check rate limit before each request - if rate_limit["resources"]["core"]["remaining"] == 0: - wait_for_rate_limit_reset(rate_limit["resources"]["core"]["reset"]) + if rate_limit["rate"]["remaining"] == 0: + wait_for_rate_limit_reset(rate_limit["rate"]["reset"]) response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT) if response.status_code == 200: @@ -308,7 +301,7 @@ def fetch_all_discussions(org: str, token: str, session: CachedSession) -> Optio all_discussions.extend(discussions) else: print(f"No discussions found or an error occurred for repository: {repo_name}") - return all_discussions if all_discussions else None + return all_discussions def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict[str, Any]]]: @@ -341,13 +334,16 @@ def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[ } """ variables = {"org": org, "repo": repo} + # Added a timeout of 10 seconds response = requests.post(url, json={"query": query, "variables": variables}, headers=headers, timeout=10) if response.status_code == 200: - data = response.json() + data: Any = response.json() if "errors" in data: print(f"GraphQL Errors: {json.dumps(data['errors'], indent=2)}") - discussions = data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", []) - return discussions if discussions is not None else [] + nodes: Optional[List[Dict[str, Any]]] = ( + data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", []) + ) + return nodes print(f"Failed to fetch discussions. Status code: {response.status_code}") print("Response: ", response.text) return None
diff --git a/poetry.lock b/poetry.lock index 9e5203a..7c99c4a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1412,4 +1412,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "b96e61b42223ef20b9be841454933163b8fc2dc93e145ef8231310088467a481" +content-hash = "5a7d1100dec7ecb62283bd2007b7638c4eebbbc9d12f8b3ad4ad15220524fd48"
diff --git a/pyproject.toml b/pyproject.toml index 7f08748..81b3267 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ packages = [ python = ">=3.8,<4.0" python-dotenv = "^1.0.1" requests-cache = "^1.2.1" +requests = "^2.32.3" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0"
From 1f61f7748c4d1e8ce96e88ab6c31606f88f1bed4 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 1 Jul 2024 12:12:12 -0400 Subject: [PATCH 9/9] make check ok locally --- poetry.lock | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/poetry.lock b/poetry.lock index 7c99c4a..fc850bb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1412,4 +1412,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "5a7d1100dec7ecb62283bd2007b7638c4eebbbc9d12f8b3ad4ad15220524fd48" +content-hash = "31ee1c4d060296bd27aa244fd375dceedd6a70f27cc03f0d7537ce15f8e18739"
diff --git a/pyproject.toml b/pyproject.toml index 81b3267..a63dda9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ python = ">=3.8,<4.0" python-dotenv = "^1.0.1" requests-cache = "^1.2.1" requests = "^2.32.3" +typing-extensions = "^4.12.2" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0"