Skip to content

Commit

Permalink
merge master
Browse files Browse the repository at this point in the history
  • Loading branch information
timothycarambat committed Oct 22, 2024
2 parents 53e0cd1 + 79ce26d commit a6a5084
Show file tree
Hide file tree
Showing 98 changed files with 2,917 additions and 839 deletions.
7 changes: 5 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"AIbitat",
"allm",
"anythingllm",
"Apipie",
"Astra",
"Chartable",
"cleancss",
Expand All @@ -18,6 +19,7 @@
"elevenlabs",
"Embeddable",
"epub",
"fireworksai",
"GROQ",
"hljs",
"huggingface",
Expand All @@ -40,17 +42,18 @@
"pagerender",
"Qdrant",
"royalblue",
"searxng",
"SearchApi",
"searxng",
"Serper",
"Serply",
"streamable",
"textgenwebui",
"togetherai",
"fireworksai",
"Unembed",
"uuidv",
"vectordbs",
"Weaviate",
"XAILLM",
"Zilliz"
],
"eslint.experimental.useFlatConfig": true,
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
- [KoboldCPP](https://github.com/LostRuins/koboldcpp)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Text Generation Web UI](https://github.com/oobabooga/text-generation-webui)
- [Apipie](https://apipie.ai/)
- [xAI](https://x.ai/)

**Embedder models:**

Expand All @@ -116,6 +118,7 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
- [PiperTTSLocal - runs in browser](https://github.com/rhasspy/piper)
- [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
- [ElevenLabs](https://elevenlabs.io/)
- Any OpenAI Compatible TTS service.

**STT (speech-to-text) support:**

Expand Down
6 changes: 4 additions & 2 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ const extensions = require("./extensions");
const { processRawText } = require("./processRawText");
const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
const app = express();
const FILE_LIMIT = "3GB";

app.use(cors({ origin: true }));
app.use(
bodyParser.text(),
bodyParser.json(),
bodyParser.text({ limit: FILE_LIMIT }),
bodyParser.json({ limit: FILE_LIMIT }),
bodyParser.urlencoded({
limit: FILE_LIMIT,
extended: true,
})
);
Expand Down
3 changes: 2 additions & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"mime": "^3.0.0",
"moment": "^2.29.4",
"node-html-parser": "^6.1.13",
"node-xlsx": "^0.24.0",
"officeparser": "^4.0.5",
"openai": "4.38.5",
"pdf-parse": "^1.1.1",
Expand All @@ -48,4 +49,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
3 changes: 2 additions & 1 deletion collector/processLink/convert/generic.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ async function scrapeGenericUrl(link, textOnly = false) {
}

const url = new URL(link);
const filename = (url.host + "-" + url.pathname).replace(".", "_");
const decodedPathname = decodeURIComponent(url.pathname);
const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;

const data = {
id: v4(),
Expand Down
113 changes: 113 additions & 0 deletions collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
const { v4 } = require("uuid");
const xlsx = require("node-xlsx").default;
const path = require("path");
const fs = require("fs");
const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

/**
 * Serializes a two-dimensional array of sheet cells into CSV text.
 *
 * Rows are joined with "\n" and cells with ",". `null`/`undefined` cells become
 * empty fields. String cells that contain a comma, a double quote, or a line
 * break are wrapped in double quotes with embedded quotes doubled (RFC 4180),
 * so such values cannot shift columns or split a row. Non-string cells
 * (numbers, booleans, dates, ...) are left untouched and stringified by `join`.
 *
 * @param {Array<Array<*>>} data - Rows of cells.
 * @returns {string} CSV text; an empty string when `data` has no rows.
 */
function convertToCSV(data) {
  // Characters that force a field to be quoted.
  const needsQuoting = /[",\r\n]/;

  const serializeCell = (cell) => {
    if (cell === null || cell === undefined) return "";
    if (typeof cell !== "string") return cell;
    if (!needsQuoting.test(cell)) return cell;
    // Embedded quotes must be doubled or the field terminates early.
    return `"${cell.replace(/"/g, '""')}"`;
  };

  return data.map((row) => row.map(serializeCell).join(",")).join("\n");
}

/**
 * Converts an .xlsx workbook into one document per non-empty sheet.
 *
 * Each sheet is serialized to CSV text via `convertToCSV`, wrapped in a
 * document payload, and written with `writeToServerDocuments` into a
 * per-upload folder under the documents storage directory. A sheet that fails
 * to convert is logged and skipped so the remaining sheets are still
 * processed. The source file is always passed to `trashFile` once parsing and
 * conversion have been attempted.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path of the workbook on disk.
 * @param {string} params.filename - Name used for titles, logs, and the output folder name.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success` is false when the workbook cannot be parsed or no sheet produced a document.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];
  // A short random suffix keeps repeated uploads of the same filename apart.
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
    trim: true,
  });

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../server/storage/documents/${folderName}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

  try {
    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    for (const sheet of workSheetsFromFile) {
      // Destructure outside the per-sheet try: the catch handler logs `name`,
      // and a binding declared inside the try block is out of scope there
      // (referencing it would throw and abort the whole workbook).
      const { name, data } = sheet;

      try {
        const content = convertToCSV(data);

        if (!content?.length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        const document = writeToServerDocuments(
          sheetData,
          `sheet-${slugify(name)}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        // Best effort: one bad sheet must not discard the others.
        console.error(`Error processing sheet "${name}":`, err);
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    // Runs on both the success and failure paths above.
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}

module.exports = asXlsx;
2 changes: 1 addition & 1 deletion collector/processSingleFile/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async function processSingleFile(targetFilename, options = {}) {
};

const fileExtension = path.extname(fullFilePath).toLowerCase();
if (!fileExtension) {
if (fullFilePath.includes(".") && !fileExtension) {
return {
success: false,
reason: `No file extension found. This file cannot be processed.`,
Expand Down
6 changes: 6 additions & 0 deletions collector/utils/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
".pptx",
],

"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
".xlsx",
],

"application/vnd.oasis.opendocument.text": [".odt"],
"application/vnd.oasis.opendocument.presentation": [".odp"],

Expand Down Expand Up @@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".odt": "./convert/asOfficeMime.js",
".odp": "./convert/asOfficeMime.js",

".xlsx": "./convert/asXlsx.js",

".mbox": "./convert/asMbox.js",

".epub": "./convert/asEPub.js",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,36 @@ class GitHubRepoLoader {
}

#validGithubUrl() {
const UrlPattern = require("url-pattern");
const pattern = new UrlPattern(
"https\\://github.com/(:author)/(:project(*))",
{
// fixes project names with special characters (.github)
segmentValueCharset: "a-zA-Z0-9-._~%/+",
try {
const url = new URL(this.repo);

// Not a github url at all.
if (url.hostname !== "github.com") {
console.log(
`[Github Loader]: Invalid Github URL provided! Hostname must be 'github.com'. Got ${url.hostname}`
);
return false;
}
);
const match = pattern.match(this.repo);
if (!match) return false;

this.author = match.author;
this.project = match.project;
return true;
// Assume the url is in the format of github.com/{author}/{project}
// Remove the first slash from the pathname so we can split it properly.
const [author, project, ..._rest] = url.pathname.slice(1).split("/");
if (!author || !project) {
console.log(
`[Github Loader]: Invalid Github URL provided! URL must be in the format of 'github.com/{author}/{project}'. Got ${url.pathname}`
);
return false;
}

this.author = author;
this.project = project;
return true;
} catch (e) {
console.log(
`[Github Loader]: Invalid Github URL provided! Error: ${e.message}`
);
return false;
}
}

// Ensure the branch provided actually exists
Expand Down
3 changes: 2 additions & 1 deletion collector/utils/extensions/WebsiteDepth/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ async function bulkScrapePages(links, outFolderPath) {
}

const url = new URL(link);
const filename = (url.host + "-" + url.pathname).replace(".", "_");
const decodedPathname = decodeURIComponent(url.pathname);
const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;

const data = {
id: v4(),
Expand Down
22 changes: 20 additions & 2 deletions collector/utils/files/mime.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
const MimeLib = require("mime");

const path = require("path");
class MimeDetector {
nonTextTypes = ["multipart", "image", "model", "audio", "video"];
badMimes = [
Expand Down Expand Up @@ -44,8 +44,26 @@ class MimeDetector {
);
}

// These are file types that are not detected by the mime library and need to be processed as text files.
// You should only add file types that are not detected by the mime library, are parsable as text, and are files
// with no extension. Otherwise, their extension should be added to the overrides array.
#specialTextFileTypes = ["dockerfile", "jenkinsfile"];

/**
* Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
* @param {string} filepath
* @returns {string}
*/
getType(filepath) {
return this.lib.getType(filepath);
const parsedMime = this.lib.getType(filepath);
if (!!parsedMime) return parsedMime;

// If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
// which we can reliably process as text files.
const baseName = path.basename(filepath)?.toLowerCase();
if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";

return null;
}
}

Expand Down
11 changes: 11 additions & 0 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
css-select "^5.1.0"
he "1.2.0"

node-xlsx@^0.24.0:
version "0.24.0"
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
dependencies:
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"

[email protected]:
version "6.9.13"
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
Expand Down Expand Up @@ -3528,6 +3535,10 @@ [email protected]:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==

"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
version "0.20.2"
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"

xml2js@^0.6.2:
version "0.6.2"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"
Expand Down
Loading

0 comments on commit a6a5084

Please sign in to comment.