Skip to content

Commit 4525824

Browse files
GitHub loader extension + extension support v1 (#469)
* feat: implement github repo loading
  fix: purge of folders
  fix: rendering of sub-files
* noshow delete on custom-documents
* Add API key support because of rate limits
* WIP for frontend of data connectors
* wip
* Add frontend form for GitHub repo data connector
* remove console.logs
  block custom-documents from being deleted
* remove _meta unused arg
* Add support for ignore pathing in request
  Ignore path input via tagging
* Update hint
1 parent 2d700b1 commit 4525824

File tree

32 files changed

+975
-128
lines changed

32 files changed

+975
-128
lines changed

collector/extensions/index.js

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
const { reqBody } = require("../utils/http");
2+
3+
function extensions(app) {
4+
if (!app) return;
5+
6+
app.post("/ext/github-repo", async function (request, response) {
7+
try {
8+
const loadGithubRepo = require("../utils/extensions/GithubRepo");
9+
const { success, reason, data } = await loadGithubRepo(reqBody(request));
10+
response.status(200).json({
11+
success,
12+
reason,
13+
data
14+
});
15+
} catch (e) {
16+
console.error(e);
17+
response.status(200).json({
18+
success: false,
19+
reason: e.message || "A processing error occurred.",
20+
data: {},
21+
});
22+
}
23+
return;
24+
});
25+
26+
// gets all branches for a specific repo
27+
app.post("/ext/github-repo/branches", async function (request, response) {
28+
try {
29+
const GithubRepoLoader = require("../utils/extensions/GithubRepo/RepoLoader");
30+
const allBranches = await (new GithubRepoLoader(reqBody(request))).getRepoBranches()
31+
response.status(200).json({
32+
success: true,
33+
reason: null,
34+
data: {
35+
branches: allBranches
36+
}
37+
});
38+
} catch (e) {
39+
console.error(e);
40+
response.status(400).json({
41+
success: false,
42+
reason: e.message,
43+
data: {
44+
branches: []
45+
}
46+
});
47+
}
48+
return;
49+
});
50+
}
51+
52+
module.exports = extensions;

collector/index.js

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ const { reqBody } = require("./utils/http");
1111
const { processSingleFile } = require("./processSingleFile");
1212
const { processLink } = require("./processLink");
1313
const { wipeCollectorStorage } = require("./utils/files");
14+
const extensions = require("./extensions");
1415
const app = express();
1516

1617
app.use(cors({ origin: true }));
@@ -57,6 +58,8 @@ app.post("/process-link", async function (request, response) {
5758
return;
5859
});
5960

61+
extensions(app);
62+
6063
app.get("/accepts", function (_, response) {
6164
response.status(200).json(ACCEPTED_MIMES);
6265
});

collector/package.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"express": "^4.18.2",
2525
"extract-zip": "^2.0.1",
2626
"fluent-ffmpeg": "^2.1.2",
27+
"ignore": "^5.3.0",
2728
"js-tiktoken": "^1.0.8",
2829
"langchain": "0.0.201",
2930
"mammoth": "^1.6.0",
@@ -35,11 +36,12 @@
3536
"pdf-parse": "^1.1.1",
3637
"puppeteer": "^21.6.1",
3738
"slugify": "^1.6.6",
39+
"url-pattern": "^1.0.3",
3840
"uuid": "^9.0.0",
3941
"wavefile": "^11.0.0"
4042
},
4143
"devDependencies": {
4244
"nodemon": "^2.0.22",
4345
"prettier": "^2.4.1"
4446
}
45-
}
47+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/**
 * Prepares and loads the files of a GitHub repository.
 *
 * Typical use: `await new RepoLoader(args).init()` followed by
 * `recursiveLoader()`, or `getRepoBranches()` on its own to list branches.
 */
class RepoLoader {
  // Last access token that passed validation, so repeated checks within the
  // same instance do not trigger another request.
  #verifiedToken = null;

  /**
   * @param {object} [args]
   * @param {string} [args.repo] - Repository URL, e.g. https://github.com/author/project
   * @param {string} [args.branch] - Branch to load; auto-assigned by init() when missing or unknown.
   * @param {string|null} [args.accessToken] - Optional GitHub access token.
   * @param {string[]} [args.ignorePaths] - Paths handed to the underlying loader as `ignorePaths`.
   */
  constructor(args = {}) {
    this.ready = false;
    this.repo = args?.repo;
    this.branch = args?.branch;
    this.accessToken = args?.accessToken || null;
    this.ignorePaths = args?.ignorePaths || [];

    this.author = null;
    this.project = null;
    this.branches = [];
  }

  // Checks that `this.repo` is an https://github.com/<author>/<project> URL and
  // stores the author and project. Parsed with the built-in URL class so that
  // project names containing dots (e.g. "next.js") are accepted.
  #validGithubUrl() {
    if (typeof this.repo !== "string") return false;

    let parsed;
    try {
      parsed = new URL(this.repo.trim());
    } catch {
      return false;
    }
    if (parsed.protocol !== "https:" || parsed.hostname !== "github.com")
      return false;

    // filter(Boolean) drops the empty pieces produced by leading/trailing slashes.
    const segments = parsed.pathname.split("/").filter(Boolean);
    if (segments.length !== 2) return false;

    const [author, project] = segments;
    this.author = author;
    this.project = project;
    return true;
  }

  // Ensure the branch provided actually exists
  // and if it does not or has not been set auto-assign to primary branch.
  async #validBranch() {
    await this.getRepoBranches();
    if (!!this.branch && this.branches.includes(this.branch)) return;

    console.log(
      "[Github Loader]: Branch not set! Auto-assigning to a default branch."
    );
    // Prefer main, then master, then any branch that exists. "master" is only
    // a last resort for when no branch list could be fetched at all.
    const preferred = ["main", "master"].find((name) =>
      this.branches.includes(name)
    );
    this.branch = preferred ?? this.branches[0] ?? "master";
    console.log(`[Github Loader]: Branch auto-assigned to ${this.branch}.`);
  }

  // Verifies the access token with a lightweight authenticated request.
  // On any failure the token is dropped so later requests go out unauthenticated.
  async #validateAccessToken() {
    if (!this.accessToken) return;
    if (this.accessToken === this.#verifiedToken) return;

    try {
      const res = await fetch("https://api.github.com/octocat", {
        method: "GET",
        headers: {
          Authorization: `Bearer ${this.accessToken}`,
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });
      if (!res.ok) throw new Error(res.statusText);
      this.#verifiedToken = this.accessToken;
    } catch (e) {
      console.error(
        "Invalid Github Access Token provided! Access token will not be used",
        e.message
      );
      this.accessToken = null;
    }
  }

  /**
   * Validates the repo URL, branch and access token.
   * @returns {Promise<RepoLoader|undefined>} This instance once ready, or
   *   undefined (with `ready` left false) when the repo URL is not valid.
   */
  async init() {
    if (!this.#validGithubUrl()) return;
    await this.#validBranch();
    await this.#validateAccessToken();
    this.ready = true;
    return this;
  }

  /**
   * Loads the repository's documents through langchain's GithubRepoLoader.
   * Recursion into sub-directories is only enabled when an access token is set.
   * @returns {Promise<object[]>} Every document yielded by the loader.
   * @throws {Error} When init() has not completed successfully.
   */
  async recursiveLoader() {
    if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
    const {
      GithubRepoLoader: LCGithubLoader,
    } = require("langchain/document_loaders/web/github");

    if (this.accessToken)
      console.log(
        `[Github Loader]: Access token set! Recursive loading enabled!`
      );

    const loader = new LCGithubLoader(this.repo, {
      accessToken: this.accessToken,
      branch: this.branch,
      recursive: !!this.accessToken, // Recursive will hit rate limits.
      maxConcurrency: 5,
      unknown: "ignore",
      ignorePaths: this.ignorePaths,
    });

    const docs = [];
    for await (const doc of loader.loadAsStream()) docs.push(doc);
    return docs;
  }

  // Sort branches to always show either main or master at the top of the result.
  #branchPrefSort(branches = []) {
    const preferredSort = ["main", "master"];
    return branches.reduce((acc, branch) => {
      if (preferredSort.includes(branch)) return [branch, ...acc];
      return [...acc, branch];
    }, []);
  }

  /**
   * Get all branches for a given repo.
   *
   * Request failures are logged and end the paging loop; whatever was
   * collected up to that point is still returned.
   * @returns {Promise<string[]>} Unique branch names with main/master first,
   *   or an empty array when the repo URL is not valid.
   */
  async getRepoBranches() {
    if (!this.#validGithubUrl() || !this.author || !this.project) return [];
    await this.#validateAccessToken(); // Ensure API access token is valid for pre-flight

    const perPage = 100;
    const names = [];
    let page = 1; // GitHub pagination is 1-based.
    let polling = true;

    while (polling) {
      console.log(`Fetching page ${page} of branches for ${this.project}`);
      try {
        const res = await fetch(
          `https://api.github.com/repos/${this.author}/${this.project}/branches?per_page=${perPage}&page=${page}`,
          {
            method: "GET",
            headers: {
              ...(this.accessToken
                ? { Authorization: `Bearer ${this.accessToken}` }
                : {}),
              "X-GitHub-Api-Version": "2022-11-28",
            },
          }
        );
        if (!res.ok)
          throw new Error(`Invalid request to Github API: ${res.statusText}`);

        const branchObjects = await res.json();
        if (!Array.isArray(branchObjects))
          throw new Error("Invalid request to Github API: unexpected response");

        names.push(...branchObjects.map((branch) => branch.name));
        // A short page means there is nothing left to fetch.
        polling = branchObjects.length === perPage;
        page++;
      } catch (err) {
        polling = false;
        console.log(`RepoLoader.branches`, err);
      }
    }

    this.branches = [...new Set(names)];
    return this.#branchPrefSort(this.branches);
  }
}
148+
149+
module.exports = RepoLoader;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
const RepoLoader = require("./RepoLoader");
2+
const fs = require("fs");
3+
const path = require("path");
4+
const { default: slugify } = require("slugify");
5+
const { v4 } = require("uuid");
6+
const { writeToServerDocuments } = require("../../files");
7+
const { tokenizeString } = require("../../tokenizer");
8+
9+
/**
 * Loads a GitHub repository and saves each non-empty file as a document in a
 * new folder under the server's document storage.
 *
 * @param {object} args - Passed straight to the RepoLoader constructor.
 * @returns {Promise<{success: boolean, reason: string|null, data?: object}>}
 *   On success `data` holds `author`, `repo`, `branch`, `files` (number of
 *   documents the loader returned) and `destination` (the output folder name).
 *   On failure `success` is false, `reason` explains why and `data` is absent.
 */
async function loadGithubRepo(args) {
  const repo = new RepoLoader(args);
  await repo.init();

  if (!repo.ready)
    return {
      success: false,
      reason: "Could not prepare Github repo for loading! Check URL",
    };

  console.log(
    `-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
  );
  const docs = await repo.recursiveLoader();
  if (!docs.length) {
    return {
      success: false,
      reason: "No files were found for those settings.",
    };
  }

  console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);
  // A short random suffix keeps repeated loads of the same repo/branch apart.
  const outFolder = slugify(
    `${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${outFolder}`
  );
  // recursive: the parent documents directory may not exist yet.
  fs.mkdirSync(outFolderPath, { recursive: true });

  for (const doc of docs) {
    if (!doc.pageContent) continue;
    const data = {
      id: v4(),
      url: "github://" + doc.metadata.source,
      title: doc.metadata.source,
      docAuthor: repo.author,
      description: "No description found.",
      docSource: repo.repo,
      chunkSource: doc.metadata.source,
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };
    console.log(
      `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
    );
    writeToServerDocuments(
      data,
      `${slugify(doc.metadata.source)}-${data.id}`,
      outFolderPath
    );
  }

  return {
    success: true,
    reason: null,
    data: {
      author: repo.author,
      repo: repo.project,
      branch: repo.branch,
      files: docs.length,
      destination: outFolder,
    },
  };
}
77+
78+
module.exports = loadGithubRepo;

collector/yarn.lock

+10
Original file line numberDiff line numberDiff line change
@@ -1530,6 +1530,11 @@ ignore-by-default@^1.0.1:
15301530
resolved "https://registry.yarnpkg.com/ignore-by-default/-/ignore-by-default-1.0.1.tgz#48ca6d72f6c6a3af00a9ad4ae6876be3889e2b09"
15311531
integrity sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==
15321532

1533+
ignore@^5.3.0:
1534+
version "5.3.0"
1535+
resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.3.0.tgz#67418ae40d34d6999c95ff56016759c718c82f78"
1536+
integrity sha512-g7dmpshy+gD7mh88OC9NwSGTKoc3kyLAZQRU1mt53Aw/vnvfXnbC+F/7F7QoYVKbV+KNvJx8wArewKy1vXMtlg==
1537+
15331538
immediate@~3.0.5:
15341539
version "3.0.6"
15351540
resolved "https://registry.yarnpkg.com/immediate/-/immediate-3.0.6.tgz#9db1dbd0faf8de6fbe0f5dd5e56bb606280de69b"
@@ -3127,6 +3132,11 @@ [email protected], unpipe@~1.0.0:
31273132
resolved "https://registry.yarnpkg.com/unpipe/-/unpipe-1.0.0.tgz#b2bf4ee8514aae6165b4817829d21b2ef49904ec"
31283133
integrity sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==
31293134

3135+
url-pattern@^1.0.3:
3136+
version "1.0.3"
3137+
resolved "https://registry.yarnpkg.com/url-pattern/-/url-pattern-1.0.3.tgz#0409292471b24f23c50d65a47931793d2b5acfc1"
3138+
integrity sha512-uQcEj/2puA4aq1R3A2+VNVBgaWYR24FdWjl7VNW83rnWftlhyzOZ/tBjezRiC2UkIzuxC8Top3IekN3vUf1WxA==
3139+
31303140
url-template@^2.0.8:
31313141
version "2.0.8"
31323142
resolved "https://registry.yarnpkg.com/url-template/-/url-template-2.0.8.tgz#fc565a3cccbff7730c775f5641f9555791439f21"

frontend/package.json

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"react-loading-icons": "^1.1.0",
2828
"react-loading-skeleton": "^3.1.0",
2929
"react-router-dom": "^6.3.0",
30+
"react-tag-input-component": "^2.0.2",
3031
"react-toastify": "^9.1.3",
3132
"text-case": "^1.0.9",
3233
"truncate": "^3.0.0",

frontend/src/App.jsx

+15
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ const GeneralExportImport = lazy(() =>
3636
import("@/pages/GeneralSettings/ExportImport")
3737
);
3838
const GeneralSecurity = lazy(() => import("@/pages/GeneralSettings/Security"));
39+
const DataConnectors = lazy(() =>
40+
import("@/pages/GeneralSettings/DataConnectors")
41+
);
42+
const DataConnectorSetup = lazy(() =>
43+
import("@/pages/GeneralSettings/DataConnectors/Connectors")
44+
);
3945
const OnboardingFlow = lazy(() => import("@/pages/OnboardingFlow"));
4046

4147
export default function App() {
@@ -103,6 +109,15 @@ export default function App() {
103109
path="/settings/workspaces"
104110
element={<ManagerRoute Component={AdminWorkspaces} />}
105111
/>
112+
<Route
113+
path="/settings/data-connectors"
114+
element={<ManagerRoute Component={DataConnectors} />}
115+
/>
116+
<Route
117+
path="/settings/data-connectors/:connector"
118+
element={<ManagerRoute Component={DataConnectorSetup} />}
119+
/>
120+
106121
{/* Onboarding Flow */}
107122
<Route path="/onboarding" element={<OnboardingFlow />} />
108123
</Routes>

0 commit comments

Comments
 (0)