merge master

Mintplex-Labs · Oct 22, 2024 · a6a5084 · a6a5084
2 parents 53e0cd1 + 79ce26d
commit a6a5084
Show file tree

Hide file tree

Showing 98 changed files with 2,917 additions and 839 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -5,6 +5,7 @@
     "AIbitat",
     "allm",
     "anythingllm",
+    "Apipie",
     "Astra",
     "Chartable",
     "cleancss",
@@ -18,6 +19,7 @@
     "elevenlabs",
     "Embeddable",
     "epub",
+    "fireworksai",
     "GROQ",
     "hljs",
     "huggingface",
@@ -40,17 +42,18 @@
     "pagerender",
     "Qdrant",
     "royalblue",
-    "searxng",
     "SearchApi",
+    "searxng",
     "Serper",
     "Serply",
     "streamable",
     "textgenwebui",
     "togetherai",
-    "fireworksai",
     "Unembed",
+    "uuidv",
     "vectordbs",
     "Weaviate",
+    "XAILLM",
     "Zilliz"
   ],
   "eslint.experimental.useFlatConfig": true,

diff --git a/README.md b/README.md
@@ -94,6 +94,8 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
 - [KoboldCPP](https://github.com/LostRuins/koboldcpp)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [Text Generation Web UI](https://github.com/oobabooga/text-generation-webui)
+- [Apipie](https://apipie.ai/)
+- [xAI](https://x.ai/)
 
 **Embedder models:**
 
@@ -116,6 +118,7 @@ AnythingLLM divides your documents into objects called `workspaces`. A Workspace
 - [PiperTTSLocal - runs in browser](https://github.com/rhasspy/piper)
 - [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
 - [ElevenLabs](https://elevenlabs.io/)
+- Any OpenAI Compatible TTS service.
 
 **STT (speech-to-text) support:**
 

diff --git a/collector/index.js b/collector/index.js
@@ -16,12 +16,14 @@ const extensions = require("./extensions");
 const { processRawText } = require("./processRawText");
 const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
 const app = express();
+const FILE_LIMIT = "3GB";
 
 app.use(cors({ origin: true }));
 app.use(
-  bodyParser.text(),
-  bodyParser.json(),
+  bodyParser.text({ limit: FILE_LIMIT }),
+  bodyParser.json({ limit: FILE_LIMIT }),
   bodyParser.urlencoded({
+    limit: FILE_LIMIT,
     extended: true,
   })
 );

diff --git a/collector/package.json b/collector/package.json
@@ -33,6 +33,7 @@
     "mime": "^3.0.0",
     "moment": "^2.29.4",
     "node-html-parser": "^6.1.13",
+    "node-xlsx": "^0.24.0",
     "officeparser": "^4.0.5",
     "openai": "4.38.5",
     "pdf-parse": "^1.1.1",
@@ -48,4 +49,4 @@
     "nodemon": "^2.0.22",
     "prettier": "^2.4.1"
   }
-}
+}
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
@@ -27,7 +27,8 @@ async function scrapeGenericUrl(link, textOnly = false) {
   }
 
   const url = new URL(link);
-  const filename = (url.host + "-" + url.pathname).replace(".", "_");
+  const decodedPathname = decodeURIComponent(url.pathname);
+  const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
 
   const data = {
     id: v4(),

diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js
@@ -0,0 +1,135 @@
+const { v4 } = require("uuid");
+const xlsx = require("node-xlsx").default;
+const path = require("path");
+const fs = require("fs");
+const {
+  createdDate,
+  trashFile,
+  writeToServerDocuments,
+} = require("../../utils/files");
+const { tokenizeString } = require("../../utils/tokenizer");
+const { default: slugify } = require("slugify");
+
+/**
+ * Serializes rows of cells into CSV text.
+ * Null and undefined cells become empty fields. String cells that contain a
+ * comma, double quote, or line break are wrapped in double quotes with any
+ * embedded double quotes doubled, so one cell cannot be read back as several
+ * fields or rows. All other cells are stringified by `Array.prototype.join`.
+ * @param {Array<Array<*>>} data - Rows, each an array of cell values.
+ * @returns {string} Cells joined by "," and rows joined by "\n".
+ */
+function convertToCSV(data) {
+  return data
+    .map((row) =>
+      row
+        .map((cell) => {
+          if (cell === null || cell === undefined) return "";
+          if (typeof cell === "string" && /[",\r\n]/.test(cell))
+            return `"${cell.replace(/"/g, '""')}"`;
+          return cell;
+        })
+        .join(",")
+    )
+    .join("\n");
+}
+
+/**
+ * Parses an xlsx file and writes one document per non-empty sheet. The source
+ * file is passed to `trashFile` whether or not parsing succeeded.
+ * @param {Object} params
+ * @param {string} params.fullFilePath - Path of the xlsx file to parse.
+ * @param {string} params.filename - Name used for the output folder, document titles, and messages.
+ * @returns {Promise<{success: boolean, reason: string|null, documents: Array}>}
+ */
+async function asXlsx({ fullFilePath = "", filename = "" }) {
+  const documents = [];
+  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
+    lower: true,
+    trim: true,
+  });
+
+  const outFolderPath =
+    process.env.NODE_ENV === "development"
+      ? path.resolve(
+          __dirname,
+          `../../../server/storage/documents/${folderName}`
+        )
+      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);
+
+  try {
+    const workSheetsFromFile = xlsx.parse(fullFilePath);
+    if (!fs.existsSync(outFolderPath))
+      fs.mkdirSync(outFolderPath, { recursive: true });
+
+    for (const sheet of workSheetsFromFile) {
+      // Read the name outside the inner `try`: a `const` declared inside it is
+      // not in scope in the `catch`, so logging it there would itself throw.
+      const name = sheet?.name;
+      try {
+        const content = convertToCSV(sheet.data);
+
+        if (!content?.length) {
+          console.warn(`Sheet "${name}" is empty. Skipping.`);
+          continue;
+        }
+
+        console.log(`-- Processing sheet: ${name} --`);
+        const sheetData = {
+          id: v4(),
+          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
+          title: `${filename} - Sheet:${name}`,
+          docAuthor: "Unknown",
+          description: `Spreadsheet data from sheet: ${name}`,
+          docSource: "an xlsx file uploaded by the user.",
+          chunkSource: "",
+          published: createdDate(fullFilePath),
+          wordCount: content.split(/\s+/).length,
+          pageContent: content,
+          token_count_estimate: tokenizeString(content).length,
+        };
+
+        // NOTE(review): sheets whose names slugify to the same value share a
+        // document name here - confirm how writeToServerDocuments handles that.
+        const document = writeToServerDocuments(
+          sheetData,
+          `sheet-${slugify(name)}`,
+          outFolderPath
+        );
+        documents.push(document);
+        console.log(
+          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
+        );
+      } catch (err) {
+        // A failing sheet is logged and skipped; the remaining sheets still run.
+        console.error(`Error processing sheet "${name}":`, err);
+        continue;
+      }
+    }
+  } catch (err) {
+    console.error("Could not process xlsx file!", err);
+    return {
+      success: false,
+      reason: `Error processing ${filename}: ${err.message}`,
+      documents: [],
+    };
+  } finally {
+    trashFile(fullFilePath);
+  }
+
+  if (documents.length === 0) {
+    console.error(`No valid sheets found in ${filename}.`);
+    return {
+      success: false,
+      reason: `No valid sheets found in ${filename}.`,
+      documents: [],
+    };
+  }
+
+  console.log(
+    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
+  );
+  return { success: true, reason: null, documents };
+}
+
+module.exports = asXlsx;
diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js
@@ -38,7 +38,7 @@ async function processSingleFile(targetFilename, options = {}) {
     };
 
   const fileExtension = path.extname(fullFilePath).toLowerCase();
-  if (!fileExtension) {
+  if (fullFilePath.includes(".") && !fileExtension) {
     return {
       success: false,
       reason: `No file extension found. This file cannot be processed.`,

diff --git a/collector/utils/constants.js b/collector/utils/constants.js
@@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
     ".pptx",
   ],
 
+  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
+    ".xlsx",
+  ],
+
   "application/vnd.oasis.opendocument.text": [".odt"],
   "application/vnd.oasis.opendocument.presentation": [".odp"],
 
@@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
   ".odt": "./convert/asOfficeMime.js",
   ".odp": "./convert/asOfficeMime.js",
 
+  ".xlsx": "./convert/asXlsx.js",
+
   ".mbox": "./convert/asMbox.js",
 
   ".epub": "./convert/asEPub.js",

diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js
@@ -29,20 +29,36 @@ class GitHubRepoLoader {
   }
 
   #validGithubUrl() {
-    const UrlPattern = require("url-pattern");
-    const pattern = new UrlPattern(
-      "https\\://github.com/(:author)/(:project(*))",
-      {
-        // fixes project names with special characters (.github)
-        segmentValueCharset: "a-zA-Z0-9-._~%/+",
+    try {
+      const url = new URL(this.repo);
+
+      // Not a github url at all.
+      if (url.hostname !== "github.com") {
+        console.log(
+          `[Github Loader]: Invalid Github URL provided! Hostname must be 'github.com'. Got ${url.hostname}`
+        );
+        return false;
       }
-    );
-    const match = pattern.match(this.repo);
-    if (!match) return false;
 
-    this.author = match.author;
-    this.project = match.project;
-    return true;
+      // Assume the url is in the format of github.com/{author}/{project}
+      // Remove the first slash from the pathname so we can split it properly.
+      const [author, project, ..._rest] = url.pathname.slice(1).split("/");
+      if (!author || !project) {
+        console.log(
+          `[Github Loader]: Invalid Github URL provided! URL must be in the format of 'github.com/{author}/{project}'. Got ${url.pathname}`
+        );
+        return false;
+      }
+
+      this.author = author;
+      this.project = project;
+      return true;
+    } catch (e) {
+      console.log(
+        `[Github Loader]: Invalid Github URL provided! Error: ${e.message}`
+      );
+      return false;
+    }
   }
 
   // Ensure the branch provided actually exists

diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
@@ -108,7 +108,8 @@ async function bulkScrapePages(links, outFolderPath) {
       }
 
       const url = new URL(link);
-      const filename = (url.host + "-" + url.pathname).replace(".", "_");
+      const decodedPathname = decodeURIComponent(url.pathname);
+      const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
 
       const data = {
         id: v4(),

diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js
@@ -1,5 +1,5 @@
 const MimeLib = require("mime");
-
+const path = require("path");
 class MimeDetector {
   nonTextTypes = ["multipart", "image", "model", "audio", "video"];
   badMimes = [
@@ -44,8 +44,26 @@ class MimeDetector {
     );
   }
 
+  // These are file types that are not detected by the mime library and need to be processed as text files.
+  // You should only add file types that are not detected by the mime library, are parsable as text, and are files
+  // with no extension. Otherwise, their extension should be added to the overrides array.
+  #specialTextFileTypes = ["dockerfile", "jenkinsfile"];
+
+  /**
+   * Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
+   * @param {string} filepath
+   * @returns {string}
+   */
   getType(filepath) {
-    return this.lib.getType(filepath);
+    const parsedMime = this.lib.getType(filepath);
+    if (!!parsedMime) return parsedMime;
+
+    // If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
+    // which we can reliably process as text files.
+    const baseName = path.basename(filepath)?.toLowerCase();
+    if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";
+
+    return null;
   }
 }
 

diff --git a/collector/yarn.lock b/collector/yarn.lock
@@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
     css-select "^5.1.0"
     he "1.2.0"
 
+node-xlsx@^0.24.0:
+  version "0.24.0"
+  resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
+  integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
+  dependencies:
+    xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"
+
 [email protected]:
   version "6.9.13"
   resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
@@ -3528,6 +3535,10 @@ [email protected]:
   resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
   integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==
 
+"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
+  version "0.20.2"
+  resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"
+
 xml2js@^0.6.2:
   version "0.6.2"
   resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"