|
/**
 * Loads the files of a GitHub repository as documents.
 *
 * Typical usage:
 *   const loader = await new RepoLoader({ repo, branch, accessToken }).init();
 *   const docs = await loader.recursiveLoader();
 *
 * `init()` validates the repo URL, resolves the branch to load and checks the
 * optional access token. `recursiveLoader()` may only be called once the
 * instance is in the ready state.
 */
class RepoLoader {
  // Matches exactly `https://github.com/<author>/<project>`.
  // Both segments are required. `.` is part of the allowed characters so that
  // repositories such as `vercel/next.js` are accepted.
  static #REPO_URL_PATTERN =
    /^https:\/\/github\.com\/([a-zA-Z0-9\-_~ %.]+)\/([a-zA-Z0-9\-_~ %.]+)$/;

  // Page size requested from the branches endpoint.
  static #BRANCHES_PER_PAGE = 100;

  /**
   * @param {Object} [args]
   * @param {string} [args.repo] - Repository URL, e.g. `https://github.com/author/project`.
   * @param {string} [args.branch] - Branch to load; auto-assigned by `init()` when missing or unknown.
   * @param {string|null} [args.accessToken] - Optional GitHub access token.
   * @param {string[]} [args.ignorePaths] - Paths forwarded to the document loader as `ignorePaths`.
   */
  constructor(args = {}) {
    this.ready = false;
    this.repo = args?.repo;
    this.branch = args?.branch;
    this.accessToken = args?.accessToken || null;
    this.ignorePaths = args?.ignorePaths || [];

    this.author = null;
    this.project = null;
    this.branches = [];
  }

  /**
   * Checks that `this.repo` is a GitHub repository URL and, when it is,
   * stores the parsed `author` and `project` on the instance.
   * @returns {boolean} true when the URL contains both an author and a project.
   */
  #validGithubUrl() {
    if (typeof this.repo !== "string") return false;

    const match = RepoLoader.#REPO_URL_PATTERN.exec(this.repo);
    if (!match) return false;

    [, this.author, this.project] = match;
    return true;
  }

  /**
   * Ensures `this.branch` points at a usable branch.
   *
   * - A requested branch that exists is kept.
   * - When the branch list could not be retrieved (empty list) an explicitly
   *   requested branch is kept as well, since its existence cannot be disproved.
   * - Otherwise the branch is auto-assigned: `main`, then `master`, then the
   *   first known branch, and finally `master` when nothing is known.
   * @returns {Promise<void>}
   */
  async #validBranch() {
    await this.getRepoBranches();
    if (this.branch && this.branches.includes(this.branch)) return;

    if (this.branch && this.branches.length === 0) {
      console.log(
        `[Github Loader]: Could not list branches. Keeping requested branch ${this.branch}.`
      );
      return;
    }

    console.log(
      this.branch
        ? `[Github Loader]: Branch ${this.branch} not found! Auto-assigning to a default branch.`
        : "[Github Loader]: Branch not set! Auto-assigning to a default branch."
    );
    const [preferred = "master"] = this.#branchPrefSort(this.branches);
    this.branch = preferred;
    console.log(`[Github Loader]: Branch auto-assigned to ${this.branch}.`);
  }

  /**
   * Verifies the access token with an authenticated request to the GitHub API.
   * On any failure (non-OK response or request error) the token is discarded
   * so later requests are made unauthenticated. No-op when no token is set.
   * @returns {Promise<void>}
   */
  async #validateAccessToken() {
    if (!this.accessToken) return;

    try {
      const res = await fetch("https://api.github.com/octocat", {
        method: "GET",
        headers: {
          Authorization: `Bearer ${this.accessToken}`,
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });
      if (!res.ok) throw new Error(res.statusText);
    } catch (e) {
      console.error(
        "Invalid Github Access Token provided! Access token will not be used",
        e.message
      );
      this.accessToken = null;
    }
  }

  /**
   * Validates the repo URL, branch and access token and marks the loader ready.
   * @returns {Promise<RepoLoader|undefined>} this instance, or `undefined`
   *   when the repo URL is not a valid GitHub repository URL.
   */
  async init() {
    if (!this.#validGithubUrl()) return;
    // Branch validation lists the repo branches, which already validates the
    // access token, so the token is not validated a second time here.
    await this.#validBranch();
    this.ready = true;
    return this;
  }

  /**
   * Loads every document of the configured repo/branch.
   * Recursive loading is only enabled when an access token is available.
   * @returns {Promise<Array>} documents yielded by the underlying loader.
   * @throws {Error} when called before `init()` put the loader in ready state.
   */
  async recursiveLoader() {
    if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
    const {
      GithubRepoLoader: LCGithubLoader,
    } = require("langchain/document_loaders/web/github");

    if (this.accessToken)
      console.log(
        `[Github Loader]: Access token set! Recursive loading enabled!`
      );

    const loader = new LCGithubLoader(this.repo, {
      accessToken: this.accessToken,
      branch: this.branch,
      recursive: !!this.accessToken, // Recursive will hit rate limits.
      maxConcurrency: 5,
      unknown: "ignore",
      ignorePaths: this.ignorePaths,
    });

    const docs = [];
    for await (const doc of loader.loadAsStream()) docs.push(doc);
    return docs;
  }

  /**
   * Orders branches so `main` and then `master` (when present) come first;
   * all other branches keep their relative order.
   * @param {string[]} [branches]
   * @returns {string[]} a new, sorted array.
   */
  #branchPrefSort(branches = []) {
    const preferred = ["main", "master"].filter((name) =>
      branches.includes(name)
    );
    const others = branches.filter((name) => !preferred.includes(name));
    return [...preferred, ...others];
  }

  /**
   * Fetches all branch names of the repo from the GitHub API and caches the
   * de-duplicated list on `this.branches`.
   *
   * Request failures are logged and end the paging; whatever was collected up
   * to that point is kept.
   * @returns {Promise<string[]>} branch names with `main`/`master` first, or
   *   an empty array when the repo URL is invalid.
   */
  async getRepoBranches() {
    if (!this.#validGithubUrl() || !this.author || !this.project) return [];
    await this.#validateAccessToken(); // Ensure API access token is valid for pre-flight

    const perPage = RepoLoader.#BRANCHES_PER_PAGE;
    const names = [];
    let page = 1; // GitHub pages are 1-indexed.
    let polling = true;

    while (polling) {
      console.log(`Fetching page ${page} of branches for ${this.project}`);
      try {
        const res = await fetch(
          `https://api.github.com/repos/${this.author}/${this.project}/branches?per_page=${perPage}&page=${page}`,
          {
            method: "GET",
            headers: {
              ...(this.accessToken
                ? { Authorization: `Bearer ${this.accessToken}` }
                : {}),
              "X-GitHub-Api-Version": "2022-11-28",
            },
          }
        );
        if (!res.ok)
          throw new Error(`Invalid request to Github API: ${res.statusText}`);

        const branchObjects = await res.json();
        if (!Array.isArray(branchObjects))
          throw new Error("Unexpected response from Github API");

        names.push(...branchObjects.map((branch) => branch.name));
        // A page that is not full is the last one, so no extra request is needed.
        polling = branchObjects.length === perPage;
        page++;
      } catch (err) {
        polling = false;
        console.log(`RepoLoader.branches`, err);
      }
    }

    this.branches = [...new Set(names)];
    return this.#branchPrefSort(this.branches);
  }
}
| 148 | + |
| 149 | +module.exports = RepoLoader; |
0 commit comments