
Commit 09d470f

Authored Sep 17, 2024
Improve data ingestion (#28)
* Improve data ingestion
* Remove langfuse secrets
1 parent 15e97af commit 09d470f

24 files changed · +3918 −147 lines
 

‎.vscode/launch.json

+3-4
@@ -8,10 +8,9 @@
       "type": "node",
       "request": "launch",
       "name": "Slackbot: Debug",
-      "cwd": "${workspaceFolder}/services/slackbot",
-      "program": "${workspaceFolder}/node_modules/ts-node/dist/bin.js",
-      "args": ["${workspaceFolder}/services/slackbot/src/app.ts"],
-      "envFile": "${workspaceFolder}/services/slackbot/.env"
+      "cwd": "${workspaceFolder}",
+      "program": "${workspaceFolder}/node_modules/.bin/nx",
+      "args": ["dev", "slackbot"]
     },
     {
       "type": "node",

‎TROUBLESHOOTING.md

+4
@@ -26,6 +26,10 @@ This error usually happens when the Slack keys (`SLACK_BOT_TOKEN`, `SLACK_APP_TO
 
 If they are correct, try to restart the `slackbot` service by running `docker compose up slackbot -d`. Sometimes users update `.env` but do not restart the service itself, which causes it to keep using outdated variables.
 
+### `429: Too Many Requests`
+
+This error might appear in LiteLLM's logs. It's a bit misleading: most of the time it means you don't have enough credits left. Go to your LLM provider (OpenAI, Anthropic, etc.) and check your credits.
+
 ### Environment variables are outdated
 
 If you use VS Code, sometimes it loads environment variables from the `.env` file automatically. In most cases, this happens because of the Python extension. In our `settings.json`, we set `"python.envFile": ""`, which should prevent that. However, if that doesn't work, try to run the project from a separate terminal (not VS Code).

‎changes.patch

+2,242
Large diffs are not rendered by default.

‎services/api/src/agent/tools/static/semantic_search.ts

+1
@@ -88,6 +88,7 @@ export default async function (context: RunContext) {
       title = "PagerDuty Alert";
       break;
     }
+    case "Jira":
     case "Confluence": {
       url = document.metadata.url;
       title = document.metadata.title;

‎services/data-processor/src/build.py

+9-9
@@ -42,29 +42,29 @@ async def build_index(
 
     store = get_vector_store(index_name, index_type)
 
-    try:
-        if await store.is_index_live():
-            print("Index exists. Delete old one...")
-            await store.delete_index()
-    except Exception as e:
-        print("Could not delete index", e)
-        print("Trying to move forward")
-    await store.create_index()
+    if not await store.is_index_live():
+        await store.create_index()
 
     async def update_status(vendor_name: str, status: str):
         await db.index.update_one(
             {"_id": index_id},
             {"$set": {f"state.integrations.{vendor_name}": status}},
         )
 
+    vector_store = store.get_llama_index_store()
     documents, stats = await get_documents(
+        index=index,
+        vector_store=vector_store,
         organization_id=organization_id,
        data_sources=data_sources,
        on_progress=partial(update_status, status="in_progress"),
        on_complete=partial(update_status, status="completed"),
     )
+    # Delete nodes of documents that are about to be re-indexed
+    if len(documents) > 0:
+        docs_to_delete = list(set([document.ref_doc_id for document in documents]))
+        vector_store.delete(ref_doc_id=docs_to_delete)
 
-    vector_store = store.get_llama_index_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
     embed_model = LiteLLMEmbedding(
         api_base=litellm_url,
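
Net effect of the hunk above: instead of deleting and recreating the whole index, the build now creates the index only if it is missing and clears just the nodes whose source documents are about to be re-ingested. A minimal sketch of that deletion step, assuming a vector store that supports delete(ref_doc_id=...) exactly as used in the diff; everything else here is illustrative, not a verbatim copy of build.py:

# Illustrative sketch only; `vector_store` is assumed to behave like the one in the diff above.
def drop_stale_nodes(vector_store, documents) -> int:
    """Delete the previously stored chunks of every document that is about to be re-indexed."""
    if not documents:
        return 0
    # Many nodes can share one ref_doc_id, so deleting by ref_doc_id clears all
    # old chunks of a document before its fresh nodes are inserted.
    docs_to_delete = list({document.ref_doc_id for document in documents})
    vector_store.delete(ref_doc_id=docs_to_delete)
    return len(docs_to_delete)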

‎services/data-processor/src/loader.py

+96-21
@@ -1,6 +1,7 @@
 import os
 import asyncio
-from typing import List, Optional
+from typing import List, Optional, Any
+from dateutil import parser
 import numpy as np
 from tqdm.auto import tqdm
 from db.integrations import get_integrations_by_organization_id, populate_secrets
@@ -11,11 +12,86 @@
     Settings,
 )
 
+from llama_index.core.schema import Document
+from llama_index.core.vector_stores.types import (
+    BasePydanticVectorStore,
+    VectorStoreQuery,
+    MetadataFilters,
+    MetadataFilter,
+    FilterOperator,
+)
+
+
+async def filter_unchanged_documents(
+    vector_store: BasePydanticVectorStore,
+    documents: List[Document],
+):
+    # Create a dictionary to group documents by ref_doc_id
+    document_ids = [document.doc_id for document in documents]
+
+    result = vector_store.query(
+        VectorStoreQuery(
+            similarity_top_k=100000000000,  # TODO: This is a hack to make sure we get all the documents
+            filters=MetadataFilters(
+                filters=[
+                    MetadataFilter(
+                        key="ref_doc_id",
+                        value=document_ids,
+                        operator=FilterOperator.IN,
+                    )
+                ]
+            ),
+        )
+    )
+    db_nodes = result.nodes
+    if len(db_nodes) == 0:
+        return [], [], documents
+
+    db_nodes_groups = {}
+    for db_node in db_nodes:
+        ref_doc_id = db_node.ref_doc_id
+        if ref_doc_id not in db_nodes_groups:
+            db_nodes_groups[ref_doc_id] = []
+        db_nodes_groups[ref_doc_id].append(db_node)
+
+    new_documents = []
+    unchanged_documents = []
+    changed_documents = []
+
+    for document in documents:
+        # At the moment, if the document doesn't have an updated_at, we re-index it
+        if not document.metadata.get("updated_at"):
+            changed_documents.append(document)
+            continue
+
+        document_id = document.doc_id
+        document_nodes = db_nodes_groups.get(document_id, [])
+        if len(document_nodes) > 0:
+            document_timestamp = parser.isoparse(document.metadata["updated_at"])
+            node_timestamp = parser.isoparse(document_nodes[0].metadata["updated_at"])
+
+            # If the document's updated_at date is greater than the node's updated_at date, it means
+            # the document has been updated, so we need to re-index it
+            if document_timestamp > node_timestamp:
+                changed_documents.append(document)
+            else:
+                unchanged_documents.append(document)
+        else:
+            new_documents.append(document)
+
+    print(f"Found {len(changed_documents)} changed documents")
+    print(f"Found {len(unchanged_documents)} unchanged documents")
+    print(f"Found {len(new_documents)} new documents")
+
+    return changed_documents, unchanged_documents, new_documents
+
 
 async def get_documents(
+    index: Any,
+    vector_store: BasePydanticVectorStore,
     organization_id: str,
     data_sources: Optional[List[str]] = None,
-    total_limit: Optional[int] = 10000,
+    total_limit: Optional[int] = 10000,  # unused at the moment
     on_progress: Optional[callable] = None,
     on_complete: Optional[callable] = None,
 ):
@@ -30,14 +106,11 @@ async def get_documents(
     vendor_names = [integration.vendor.name for integration in integrations]
     print(f"Found {len(integrations)} integrations: {vendor_names}")
 
-    # Calculate the limit per source
-    limit_per_source = round(total_limit / len(integrations))
-
     stats = {}
-    documents = []
-
-    # Settings.transformations
+    total_nodes = []
     progress_bar = tqdm(integrations)
+    n_existing_nodes = index.get("stats") and sum(index["stats"].values()) or 0
+
     for integration in progress_bar:
         vendor_name = integration.vendor.name
         if on_progress:
@@ -52,9 +125,13 @@ async def get_documents(
         # Loader might be an async code, so we need to await it
         try:
             if asyncio.iscoroutinefunction(loader):
-                docs = await loader(integration)
+                raw_docs = await loader(integration)
             else:
-                docs = loader(integration)
+                raw_docs = loader(integration)
+
+            changed_documents, unchanged_documents, new_documents = (
+                await filter_unchanged_documents(vector_store, raw_docs)
+            )
         except Exception as e:
             print(f"Could not load {vendor_name}. Error: {e}")
             continue
@@ -66,19 +143,17 @@ async def get_documents(
         num_cpus = os.cpu_count()
         num_workers = min(4, num_cpus) if num_cpus > 1 else 1
 
-        # counts = [len(doc.text) for doc in docs]
-        # limit = np.percentile(counts, [99])[0]
-        # docs = [doc for doc in docs if len(doc.text) < limit]
-        docs = pipeline.run(documents=docs, num_workers=num_workers)
-
-        # Limit the number of documents per source
-        # docs = docs[:limit_per_source]
+        new_nodes = pipeline.run(documents=new_documents, num_workers=num_workers)
+        changed_nodes = pipeline.run(
+            documents=changed_documents, num_workers=num_workers
+        )
+        nodes = new_nodes + changed_nodes
 
-        print(f"Found {len(docs)} documents for {vendor_name}")
-        documents.extend(docs)
-        stats[integration.vendor.name] = len(docs)
+        print(f"Found total of {len(raw_docs)} documents for {vendor_name}")
+        total_nodes.extend(nodes)
+        stats[integration.vendor.name] = n_existing_nodes + len(new_nodes)
 
         if on_complete:
             await on_complete(vendor_name)
 
-    return documents, stats
+    return total_nodes, stats
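
For orientation, a minimal sketch (not part of the commit) of how the three buckets returned by filter_unchanged_documents are meant to be consumed, mirroring the loop above: unchanged documents are skipped entirely, while new and changed documents are re-chunked and re-embedded. The sketch assumes it lives next to filter_unchanged_documents in the same module; `pipeline` and the worker count are placeholders.

from typing import List

from llama_index.core.schema import Document
from llama_index.core.vector_stores.types import BasePydanticVectorStore


async def ingest_source(
    vector_store: BasePydanticVectorStore,
    raw_docs: List[Document],
    pipeline,  # placeholder for the repo's ingestion pipeline object
):
    changed, unchanged, new = await filter_unchanged_documents(vector_store, raw_docs)

    # Unchanged documents keep their existing nodes; only new and changed
    # documents go through chunking/embedding again.
    nodes = pipeline.run(documents=new, num_workers=4)
    nodes += pipeline.run(documents=changed, num_workers=4)
    return nodes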

‎services/data-processor/src/loaders/confluence.py

+1-1
@@ -1,7 +1,7 @@
 from collections import namedtuple
 import os
 import requests
-from loaders.raw_readers.confluence import ConfluenceReader
+from loaders.readers.confluence import ConfluenceReader
 from atlassian import Confluence
 
 from db.types import Integration

‎services/data-processor/src/loaders/github.py

+5-3
@@ -2,13 +2,13 @@
 from github import Github, Auth, GithubException
 
 # from llama_index.core import SimpleDirectoryReader
-from llama_index.readers.github.repository.github_client import GithubClient
+from loaders.utils.github_client import GithubClient
 from llama_index.readers.github import (
     GitHubIssuesClient,
 )
 from db.types import Integration
-from loaders.raw_readers.github_repo import GithubRepositoryReader
-from loaders.raw_readers.github_issues import GitHubRepositoryIssuesReader
+from loaders.readers.github_repo import GithubRepositoryReader
+from loaders.readers.github_issues import GitHubRepositoryIssuesReader
 
 
 def get_repos(token: str, repos_to_sync=None):
@@ -70,6 +70,8 @@ async def fetch_github_documents(
     # # TODO: this can crash if the repo is huge, because of Github API Rate limit.
     # # Need to find a way to "wait" maybe or to filter garbage.
     code_client = GithubClient(token, fail_on_http_error=False, verbose=True)
+
+    # TODO: updated_at timestamp doesn't seem to work (our code treats same docs as new)
     loader = GithubRepositoryReader(
         github_client=code_client,
         owner=owner,

‎services/data-processor/src/loaders/jira.py

+20-6
@@ -1,7 +1,11 @@
 import requests
-from llama_index.readers.jira import JiraReader
+from datetime import datetime, timezone
+from dateutil import parser
+from loaders.readers.jira import JiraReader
 from db.types import Integration
 
+JQL_QUERY = "issuetype is not EMPTY"
+
 
 def fetch_jira_documents(integration: Integration):
     integration_type = integration.type
@@ -19,9 +23,7 @@ def fetch_jira_documents(integration: Integration):
         loader = JiraReader(
             Oauth2={"cloud_id": cloud_id, "api_token": access_token}
         )
-        documents = loader.load_data(
-            "issuetype is not EMPTY"
-        )  # This "should" fetch all issues
+        documents = loader.load_data(JQL_QUERY)  # This "should" fetch all issues
         total_documents.extend(documents)
     else:
         loader = JiraReader(
@@ -31,12 +33,24 @@ def fetch_jira_documents(integration: Integration):
                 "server_url": integration.metadata["site_url"],
             }
         )
-        documents = loader.load_data("issuetype is not EMPTY")
+        documents = loader.load_data(JQL_QUERY)
         total_documents.extend(documents)
 
     # Adding the global "source" metadata field
     for document in total_documents:
         document.metadata.pop("labels", None)
         document.metadata["source"] = "Jira"
 
-    return documents
+        # Transform 'created_at' and 'updated_at' to UTC with milliseconds
+        created_at = parser.isoparse(document.metadata["created_at"])
+        updated_at = parser.isoparse(document.metadata["updated_at"])
+        document.metadata["created_at"] = (
+            created_at.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
+            + "Z"
+        )
+        document.metadata["updated_at"] = (
+            updated_at.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
+            + "Z"
+        )
+
+    return total_documents
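
The loop above normalizes Jira's offset-aware timestamps to UTC ISO-8601 strings with millisecond precision, so they compare cleanly against the updated_at values already stored on nodes. A small standalone example of the same expression (the input timestamp is made up):

from datetime import timezone

from dateutil import parser

raw = "2024-09-17T12:34:56.789+0300"  # hypothetical Jira timestamp with a +03:00 offset

dt = parser.isoparse(raw)
normalized = dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
print(normalized)  # 2024-09-17T09:34:56.789Z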

‎services/data-processor/src/loaders/notion.py

+1-1
@@ -1,6 +1,6 @@
 from db.types import Integration
 from notion_client import Client
-from llama_index.readers.notion import NotionPageReader
+from loaders.readers.notion import NotionPageReader
 
 
 def fetch_notion_documents(integration: Integration):
services/data-processor/src/loaders/pagerduty.py

@@ -1,82 +1,11 @@
 from db.types import Integration
-import httpx
-from llama_index.core import Document
-
-INCIDENT_TEXT_TEMPLATE = """
-Incident title: {title}
-Incident description: {description}
-Incident summary: {summary}
-Incident status: {status}
-Service name: {service_name}
-Created at: {created_at}
-"""
-
-
-async def get_incidents(integration: Integration):
-    access_token = integration.credentials["access_token"]
-    integration_type = integration.type
-    headers = {}
-    if integration_type == "basic":
-        headers["Authorization"] = f"Token token={access_token}"
-    elif integration_type == "oauth":
-        headers["Authorization"] = f"Bearer {access_token}"
-    else:
-        raise ValueError(f"Invalid integration type: {integration_type}")
-
-    limit = 100
-    offset = 0
-    resolved_incidents = []
-    while True:
-        async with httpx.AsyncClient() as client:
-            response = await client.get(
-                "https://api.pagerduty.com/incidents",
-                headers=headers,
-                params={
-                    "date_range": "all",
-                    "statuses[]": "resolved",
-                    "limit": limit,
-                    "offset": offset,
-                },
-            )
-            data = response.json()
-            incidents = data["incidents"]
-            resolved_incidents.extend(incidents)
-            if not data["more"]:
-                break
-            offset += limit
-    return resolved_incidents
+from loaders.readers.pagerduty import PagerDutyReader
 
 
 async def fetch_pagerduty_documents(integration: Integration):
-    incidents = await get_incidents(integration)
-
-    documents = []
-    for incident in incidents:
-        service = incident.get("service", {})
-        service_name = service.get("summary", "Unknown")
-
-        text = INCIDENT_TEXT_TEMPLATE.format(
-            title=incident["title"],
-            description=incident["description"],
-            summary=incident["summary"],
-            status=incident["status"],
-            service_name=service_name,
-            created_at=incident["created_at"],
-        )
-        metadata = {
-            "source": "PagerDuty",
-            "id": incident["id"],
-            "link": incident["html_url"],
-            "status": incident["status"],
-            "urgency": incident["urgency"],
-            "service_id": service.get("id", "Unknown"),
-            "first_trigger_log_entry_id": incident.get(
-                "first_trigger_log_entry", {}
-            ).get("id", "Unknown"),
-            "created_at": incident["created_at"],
-        }
-
-        document = Document(doc_id=incident["id"], text=text, metadata=metadata)
-        documents.append(document)
+    access_token = integration.credentials["access_token"]
+    token_type = integration.type
+    loader = PagerDutyReader(access_token, token_type)
+    documents = await loader.load_data()
 
     return documents

‎services/data-processor/src/loaders/raw_readers/confluence.py ‎services/data-processor/src/loaders/readers/confluence.py

+6-4
@@ -163,6 +163,7 @@ def load_data(
         if not start:
             start = 0
 
+        expand = "body.export_view.value,version"
         pages: List = []
         if space_key:
             pages.extend(
@@ -172,7 +173,7 @@ def load_data(
                    max_num_results=max_num_results,
                    space=space_key,
                    status=page_status,
-                    expand="body.export_view.value",
+                    expand=expand,
                    content_type="page",
                )
            )
@@ -183,7 +184,7 @@ def load_data(
                    cursor=cursor,
                    cql=f'type="page" AND label="{label}"',
                    max_num_results=max_num_results,
-                    expand="body.export_view.value",
+                    expand=expand,
                )
            )
        elif cql:
@@ -193,7 +194,7 @@ def load_data(
                    cursor=cursor,
                    cql=cql,
                    max_num_results=max_num_results,
-                    expand="body.export_view.value",
+                    expand=expand,
                )
            )
        elif page_ids:
@@ -217,7 +218,7 @@ def load_data(
                self._get_data_with_retry(
                    self.confluence.get_page_by_id,
                    page_id=page_id,
-                    expand="body.export_view.value",
+                    expand=expand,
                )
            )
 
@@ -342,6 +343,7 @@ def process_page(self, page, include_attachments, text_maker):
                "page_id": page["id"],
                "status": page["status"],
                "url": self.base_url + page["_links"]["webui"],
+                "updated_at": page["version"]["when"],
            },
        )

‎services/data-processor/src/loaders/raw_readers/github_issues.py ‎services/data-processor/src/loaders/readers/github_issues.py

+1
@@ -183,6 +183,7 @@ async def load_data(
            extra_info = {
                "state": issue["state"],
                "created_at": issue["created_at"],
+                "updated_at": issue["updated_at"],
                # url is the API URL
                "url": issue["url"],
                # source is the HTML URL, more convenient for humans

‎services/data-processor/src/loaders/raw_readers/github_repo.py ‎services/data-processor/src/loaders/readers/github_repo.py

+6
@@ -446,6 +446,10 @@ async def _recurse_tree(
            )
        return blobs_and_full_paths
 
+    async def _get_latest_commit(self, path) -> str:
+        commits = await self._github_client.get_commits(self._owner, self._repo, path)
+        return commits[0]
+
    async def _generate_documents(
        self,
        blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]],
@@ -472,6 +476,7 @@ async def _generate_documents(
        documents = []
        async for blob_data, full_path in buffered_iterator:
            print_if_verbose(self._verbose, f"generating document for {full_path}")
+            latest_commit = await self._get_latest_commit(full_path)
            assert (
                blob_data.encoding == "base64"
            ), f"blob encoding {blob_data.encoding} not supported"
@@ -525,6 +530,7 @@ async def _generate_documents(
                    "file_path": full_path,
                    "file_name": full_path.split("/")[-1],
                    "url": url,
+                    "updated_at": latest_commit.commit.author.date,
                },
            )
            documents.append(document)
services/data-processor/src/loaders/readers/jira.py

@@ -0,0 +1,117 @@
+from typing import List, Optional, TypedDict
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class BasicAuth(TypedDict):
+    email: str
+    api_token: str
+    server_url: str
+
+
+class Oauth2(TypedDict):
+    cloud_id: str
+    api_token: str
+
+
+class JiraReader(BaseReader):
+    """Jira reader. Reads data from Jira issues from passed query.
+
+    Args:
+        Optional basic_auth:{
+            "email": "email",
+            "api_token": "token",
+            "server_url": "server_url"
+        }
+        Optional oauth:{
+            "cloud_id": "cloud_id",
+            "api_token": "token"
+        }
+    """
+
+    def __init__(
+        self,
+        email: Optional[str] = None,
+        api_token: Optional[str] = None,
+        server_url: Optional[str] = None,
+        BasicAuth: Optional[BasicAuth] = None,
+        Oauth2: Optional[Oauth2] = None,
+    ) -> None:
+        from jira import JIRA
+
+        if email and api_token and server_url:
+            if BasicAuth is None:
+                BasicAuth = {}
+            BasicAuth["email"] = email
+            BasicAuth["api_token"] = api_token
+            BasicAuth["server_url"] = server_url
+
+        if Oauth2:
+            options = {
+                "server": f"https://api.atlassian.com/ex/jira/{Oauth2['cloud_id']}",
+                "headers": {"Authorization": f"Bearer {Oauth2['api_token']}"},
+            }
+            self.jira = JIRA(options=options)
+        else:
+            self.jira = JIRA(
+                basic_auth=(BasicAuth["email"], BasicAuth["api_token"]),
+                server=f"https://{BasicAuth['server_url']}",
+            )
+
+    def load_data(self, query: str) -> List[Document]:
+        relevant_issues = self.jira.search_issues(query)
+
+        issues = []
+
+        assignee = ""
+        reporter = ""
+        epic_key = ""
+        epic_summary = ""
+        epic_descripton = ""
+
+        for issue in relevant_issues:
+            # Iterates through only issues and not epics
+            if "parent" in (issue.raw["fields"]):
+                if issue.fields.assignee:
+                    assignee = issue.fields.assignee.displayName
+
+                if issue.fields.reporter:
+                    reporter = issue.fields.reporter.displayName
+
+                if issue.raw["fields"]["parent"]["key"]:
+                    epic_key = issue.raw["fields"]["parent"]["key"]
+
+                if issue.raw["fields"]["parent"]["fields"]["summary"]:
+                    epic_summary = issue.raw["fields"]["parent"]["fields"]["summary"]
+
+                if issue.raw["fields"]["parent"]["fields"]["status"]["description"]:
+                    epic_descripton = issue.raw["fields"]["parent"]["fields"]["status"][
+                        "description"
+                    ]
+
+                issues.append(
+                    Document(
+                        text=f"{issue.fields.summary} \n {issue.fields.description}",
+                        doc_id=issue.id,
+                        extra_info={
+                            "id": issue.id,
+                            "title": issue.fields.summary,
+                            "url": issue.permalink(),
+                            "created_at": issue.fields.created,
+                            "updated_at": issue.fields.updated,
+                            "labels": issue.fields.labels,
+                            "status": issue.fields.status.name,
+                            "assignee": assignee,
+                            "reporter": reporter,
+                            "project": issue.fields.project.name,
+                            "issue_type": issue.fields.issuetype.name,
+                            "priority": issue.fields.priority.name,
+                            "epic_key": epic_key,
+                            "epic_summary": epic_summary,
+                            "epic_description": epic_descripton,
+                        },
+                    )
+                )
+
+        return issues
services/data-processor/src/loaders/readers/notion.py

@@ -0,0 +1,211 @@
+"""Notion reader."""
+
+from datetime import datetime
+import os
+from typing import Any, Dict, List, Optional
+
+import requests  # type: ignore
+from llama_index.core.readers.base import BasePydanticReader
+from llama_index.core.schema import Document
+
+INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN"
+BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
+DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
+SEARCH_URL = "https://api.notion.com/v1/search"
+
+
+def utc_to_iso(utc_time: str) -> datetime:
+    return datetime.fromisoformat(utc_time.replace("Z", "+00:00"))
+
+
+# TODO: Notion DB reader coming soon!
+class NotionPageReader(BasePydanticReader):
+    """Notion Page reader.
+
+    Reads a set of Notion pages.
+
+    Args:
+        integration_token (str): Notion integration token.
+
+    """
+
+    is_remote: bool = True
+    token: str
+    headers: Dict[str, str]
+
+    def __init__(self, integration_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        if integration_token is None:
+            integration_token = os.getenv(INTEGRATION_TOKEN_NAME)
+            if integration_token is None:
+                raise ValueError(
+                    "Must specify `integration_token` or set environment "
+                    "variable `NOTION_INTEGRATION_TOKEN`."
+                )
+
+        token = integration_token
+        headers = {
+            "Authorization": "Bearer " + token,
+            "Content-Type": "application/json",
+            "Notion-Version": "2022-06-28",
+        }
+
+        super().__init__(token=token, headers=headers)
+
+    @classmethod
+    def class_name(cls) -> str:
+        """Get the name identifier of the class."""
+        return "NotionPageReader"
+
+    def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
+        """Read a block."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        most_recent_time = None
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+
+            for result in data["results"]:
+                result_type = result["type"]
+                result_obj = result[result_type]
+
+                cur_result_text_arr = []
+                if "rich_text" in result_obj:
+                    for rich_text in result_obj["rich_text"]:
+                        # skip if doesn't have text object
+                        if "text" in rich_text:
+                            text = rich_text["text"]["content"]
+                            prefix = "\t" * num_tabs
+                            cur_result_text_arr.append(prefix + text)
+
+                result_block_id = result["id"]
+                has_children = result["has_children"]
+                if has_children:
+                    children_text, _ = self._read_block(
+                        result_block_id, num_tabs=num_tabs + 1
+                    )
+                    cur_result_text_arr.append(children_text)
+
+                cur_result_text = "\n".join(cur_result_text_arr)
+                result_lines_arr.append(cur_result_text)
+                last_edited_time = result["last_edited_time"]
+
+                if most_recent_time is None or utc_to_iso(
+                    last_edited_time
+                ) > utc_to_iso(most_recent_time):
+                    most_recent_time = last_edited_time
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+
+        block_text = "\n".join(result_lines_arr)
+
+        return block_text, most_recent_time
+
+    def read_page(self, page_id: str) -> str:
+        """Read a page."""
+        return self._read_block(page_id)
+
+    def query_database(
+        self, database_id: str, query_dict: Dict[str, Any] = {"page_size": 100}
+    ) -> List[str]:
+        """Get all the pages from a Notion database."""
+        pages = []
+
+        res = requests.post(
+            DATABASE_URL_TMPL.format(database_id=database_id),
+            headers=self.headers,
+            json=query_dict,
+        )
+        res.raise_for_status()
+        data = res.json()
+
+        pages.extend(data.get("results"))
+
+        while data.get("has_more"):
+            query_dict["start_cursor"] = data.get("next_cursor")
+            res = requests.post(
+                DATABASE_URL_TMPL.format(database_id=database_id),
+                headers=self.headers,
+                json=query_dict,
+            )
+            res.raise_for_status()
+            data = res.json()
+            pages.extend(data.get("results"))
+
+        return [page["id"] for page in pages]
+
+    def search(self, query: str) -> List[str]:
+        """Search Notion page given a text query."""
+        done = False
+        next_cursor: Optional[str] = None
+        page_ids = []
+        while not done:
+            query_dict = {
+                "query": query,
+            }
+            if next_cursor is not None:
+                query_dict["start_cursor"] = next_cursor
+            res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
+            data = res.json()
+            for result in data["results"]:
+                page_id = result["id"]
+                page_ids.append(page_id)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                next_cursor = data["next_cursor"]
+        return page_ids
+
+    def load_data(
+        self, page_ids: List[str] = [], database_id: Optional[str] = None
+    ) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            page_ids (List[str]): List of page ids to load.
+            database_id (str): Database_id from which to load page ids.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        if not page_ids and not database_id:
+            raise ValueError("Must specify either `page_ids` or `database_id`.")
+        docs = []
+        if database_id is not None:
+            # get all the pages in the database
+            page_ids = self.query_database(database_id)
+            for page_id in page_ids:
+                page_text, most_recent_time = self.read_page(page_id)
+                docs.append(
+                    Document(
+                        text=page_text,
+                        id_=page_id,
+                        extra_info={"page_id": page_id, "updated_at": most_recent_time},
+                    )
+                )
+        else:
+            for page_id in page_ids:
+                page_text, most_recent_time = self.read_page(page_id)
+                docs.append(
+                    Document(
+                        text=page_text,
+                        id_=page_id,
+                        extra_info={"page_id": page_id, "updated_at": most_recent_time},
+                    )
+                )
+
+        return docs
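
A short usage sketch of the vendored reader above; the integration token and page IDs are placeholders:

from loaders.readers.notion import NotionPageReader

reader = NotionPageReader(integration_token="secret_xxx")  # placeholder token
docs = reader.load_data(page_ids=["page-id-1", "page-id-2"])  # placeholder page IDs

for doc in docs:
    # Each document carries the most recent last_edited_time seen across its blocks.
    print(doc.doc_id, doc.metadata["updated_at"])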
services/data-processor/src/loaders/readers/pagerduty.py

@@ -0,0 +1,93 @@
+import httpx
+from typing import List
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+INCIDENT_TEXT_TEMPLATE = """
+Incident title: {title}
+Incident description: {description}
+Incident summary: {summary}
+Incident status: {status}
+Service name: {service_name}
+Created at: {created_at}
+"""
+
+
+class PagerDutyReader(BaseReader):
+    access_token: str
+    token_type: str
+
+    def __init__(self, access_token: str, token_type: str):
+        self.access_token = access_token
+        self.token_type = token_type
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "PagerDutyReader"
+
+    async def get_incidents(self) -> List[Document]:
+        headers = {}
+        if self.token_type == "basic":
+            headers["Authorization"] = f"Token token={self.access_token}"
+        elif self.token_type == "oauth":
+            headers["Authorization"] = f"Bearer {self.access_token}"
+
+        limit = 100
+        offset = 0
+        resolved_incidents = []
+        while True:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    "https://api.pagerduty.com/incidents",
+                    headers=headers,
+                    params={
+                        "date_range": "all",
+                        "statuses[]": "resolved",
+                        "limit": limit,
+                        "offset": offset,
+                    },
+                )
+                data = response.json()
+                incidents = data["incidents"]
+                resolved_incidents.extend(incidents)
+                if not data["more"]:
+                    break
+                offset += limit
+        return resolved_incidents
+
+    async def load_data(self) -> List[Document]:
+        incidents = await self.get_incidents()
+
+        documents = []
+
+        for incident in incidents:
+            service = incident.get("service", {})
+            service_name = service.get("summary", "Unknown")
+
+            text = INCIDENT_TEXT_TEMPLATE.format(
+                title=incident["title"],
+                description=incident["description"],
+                summary=incident["summary"],
+                status=incident["status"],
+                service_name=service_name,
+                created_at=incident["created_at"],
+            )
+            metadata = {
+                "source": "PagerDuty",
+                "title": incident["title"],
+                "id": incident["id"],
+                "link": incident["html_url"],
+                "status": incident["status"],
+                "urgency": incident["urgency"],
+                "service_id": service.get("id", "Unknown"),
+                "first_trigger_log_entry_id": incident.get(
+                    "first_trigger_log_entry", {}
+                ).get("id", "Unknown"),
+                "created_at": incident["created_at"],
+                "updated_at": incident["updated_at"],
+            }
+
+            document = Document(doc_id=incident["id"], text=text, metadata=metadata)
+            documents.append(document)
+
+        return documents

‎services/data-processor/src/loaders/raw_readers/slack.py ‎services/data-processor/src/loaders/readers/slack.py

+58-14
@@ -103,8 +103,11 @@ def _read_message(self, channel_id: str, message_ts: str) -> str:
 
        """Read a message."""
 
+        # TODO: this method reads all the thread messages and creates one document
+        # At the moment, we don't use the usernames + timestamps. This can be a nice improvement.
        messages_text: List[str] = []
        next_cursor = None
+        most_recent_update = None
        while True:
            try:
                # https://slack.com/api/conversations.replies
@@ -128,6 +131,18 @@ def _read_message(self, channel_id: str, message_ts: str) -> str:
                    **conversations_replies_kwargs  # type: ignore
                )
                messages = result["messages"]
+
+                for message in messages:
+                    last_edited = float(
+                        message.get("edited", {}).get("ts", message["ts"])
+                    )
+                    last_edited_utc = datetime.utcfromtimestamp(last_edited)
+                    if (
+                        most_recent_update is None
+                        or last_edited_utc > most_recent_update
+                    ):
+                        most_recent_update = last_edited_utc
+
                messages_text.extend(message["text"] for message in messages)
                if not result["has_more"]:
                    break
@@ -143,7 +158,10 @@ def _read_message(self, channel_id: str, message_ts: str) -> str:
                    time.sleep(int(e.response.headers["retry-after"]))
                else:
                    logger.error(f"Error parsing conversation replies: {e}")
-        return "\n\n".join(messages_text)
+
+        most_recent_update = most_recent_update.isoformat(timespec="milliseconds") + "Z"
+
+        return ("\n\n".join(messages_text), most_recent_update)
 
    def _read_channel(self, channel_id: str, reverse_chronological: bool) -> str:
        from slack_sdk.errors import SlackApiError
@@ -162,6 +180,7 @@ def _read_channel(self, channel_id: str, reverse_chronological: bool) -> str:
                    "channel": channel_id,
                    "cursor": next_cursor,
                    "latest": str(self.latest_date_timestamp),
+                    "include_all_metadata": True,
                }
                if self.earliest_date_timestamp is not None:
                    conversations_history_kwargs["oldest"] = str(
@@ -175,18 +194,34 @@ def _read_channel(self, channel_id: str, reverse_chronological: bool) -> str:
                logger.info(
                    f"{len(conversation_history)} messages found in {channel_id}"
                )
-                result_messages.extend(
-                    (
-                        {
-                            **message,
-                            "text": self._read_message(channel_id, message["ts"]),
-                        }
-                        if message.get("thread_ts")
-                        == message["ts"]  # Message is a parent message of a thread
-                        else message
-                    )
-                    for message in tqdm(conversation_history, desc="Reading messages")
-                )
+
+                for message in tqdm(conversation_history, desc="Reading messages"):
+                    if message.get("thread_ts") == message["ts"]:
+                        # Message is a thread parent message. Let's explore this thread!
+                        text, most_recent_update = self._read_message(
+                            channel_id, message["ts"]
+                        )
+                        result_messages.append(
+                            {
+                                **message,
+                                "text": text,
+                                "updated_at": most_recent_update,
+                            }
+                        )
+                    else:
+                        last_edited = float(
+                            message.get("edited", {}).get("ts", message["ts"])
+                        )
+                        result_messages.append(
+                            {
+                                **message,
+                                "updated_at": datetime.utcfromtimestamp(
+                                    last_edited
+                                ).isoformat(timespec="milliseconds")
+                                + "Z",
+                            }
+                        )
+
                if not result["has_more"]:
                    break
                next_cursor = result["response_metadata"]["next_cursor"]
@@ -222,11 +257,20 @@ def load_data(
            )
            # Remove messages with empty text
            messages = [message for message in messages if message["text"] != ""]
+            # debugging step
+            for message in messages:
+                if "glorious poop" in message["text"]:
+                    print("this is it boys")
+
            documents = [
                Document(
                    id_=message["ts"],
                    text=message["text"],
-                    metadata={"channel_id": channel_id, "ts": message["ts"]},
+                    metadata={
+                        "channel_id": channel_id,
+                        "ts": message["ts"],
+                        "updated_at": message["updated_at"],
+                    },
                )
                for message in messages
            ]
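
The updated_at value computed above falls back to the message's own ts when it was never edited. A tiny standalone example of that expression (the message dict is made up):

from datetime import datetime

message = {"ts": "1726560000.000100", "edited": {"ts": "1726563600.000200"}}  # hypothetical Slack message

last_edited = float(message.get("edited", {}).get("ts", message["ts"]))
updated_at = datetime.utcfromtimestamp(last_edited).isoformat(timespec="milliseconds") + "Z"
print(updated_at)  # 2024-09-17T09:00:00.000Z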

‎services/data-processor/src/loaders/slack.py

+1-2
@@ -6,7 +6,7 @@
 
 from typing import List
 
-from loaders.raw_readers.slack import SlackReader
+from loaders.readers.slack import SlackReader
 
 
 def join_channels(client: WebClient, channel_ids: List[str]):
@@ -27,7 +27,6 @@ def fetch_slack_documents(integration: Integration):
        types=["public_channel", "private_channel"],
    )
    channel_ids = [channel["id"] for channel in channels["channels"]]
-
    id2name = {channel["id"]: channel["name"] for channel in channels["channels"]}
 
    # Try to join the channels, to avoid "not_in_channel" in Slack.

‎services/data-processor/src/loaders/utils/github_client.py

+611
Large diffs are not rendered by default.

‎services/data-processor/src/main.py

+3-5
@@ -49,11 +49,9 @@ async def start_build_index(
 
    # TODO: we re-create the index every time. We need to consider
    # changing this in the future
-    existing_index = await get_index_by_organization_id(organization_id)
-    if existing_index:
-        await delete_index_by_id(existing_index["_id"])
-
-    index = await create_index(organization_id, data_sources, "chromadb")
+    index = await get_index_by_organization_id(organization_id)
+    if not index:
+        index = await create_index(organization_id, data_sources, "chromadb")
 
    background_tasks.add_task(
        build_index,

‎services/data-processor/src/rag/chromadb.py

+1-1
@@ -1,6 +1,6 @@
 import chromadb
 from chromadb.config import Settings
-from llama_index.vector_stores.chroma import (
+from .raw_vector_stores.chromadb import (
     ChromaVectorStore as LIChromaVectorStore,
 )
 from rag.base import BaseVectorStore

‎services/data-processor/src/rag/raw_vector_stores/chromadb.py

+423
Large diffs are not rendered by default.
