Initial PR to add support for stackoverflow teams #78

Open · wants to merge 11 commits into base: main
1 change: 1 addition & 0 deletions README.md
@@ -32,6 +32,7 @@ Coming Soon...
- [X] Bookstack - by [@flifloo](https://github.com/flifloo) :pray:
- [X] Mattermost - by [@itaykal](https://github.com/Itaykal) :pray:
- [X] RocketChat - by [@flifloo](https://github.com/flifloo) :pray:
- [X] Stack Overflow for Teams - by [@allen-munsch](https://github.com/allen-munsch) :pray:
- [ ] Gitlab Issues (In PR :pray:)
- [ ] Zendesk (In PR :pray:)
- [ ] Azure DevOps (In PR :pray:)
2 changes: 1 addition & 1 deletion app/api/data_source.py
@@ -85,7 +85,7 @@ async def list_locations(request: Request, data_source_name: str, config: dict)
@router.post("")
async def connect_data_source(request: Request, dto: AddDataSourceDto, background_tasks: BackgroundTasks) -> int:
logger.info(f"Adding data source {dto.name} with config {json.dumps(dto.config)}")
data_source = DataSourceContext.create_data_source(name=dto.name, config=dto.config)
data_source = await DataSourceContext.create_data_source(name=dto.name, config=dto.config)
Posthog.added_data_source(uuid=request.headers.get('uuid'), name=dto.name)
# in main.py we have a background task that runs every 5 minutes and indexes the data source
# but here we want to index the data source immediately
1 change: 1 addition & 0 deletions app/clear_ack_queue.sh
@@ -0,0 +1 @@
sqlite3 ~/.gerev/storage/tasks.sqlite3/data.db 'delete from ack_queue_task where _id in (select _id from ack_queue_task);'
1 change: 1 addition & 0 deletions app/clear_data_sources.sh
@@ -0,0 +1 @@
sqlite3 ~/.gerev/storage/db.sqlite3 'delete from data_source where id in (select id from data_source);'
2 changes: 1 addition & 1 deletion app/data_source/api/base_data_source.py
@@ -61,7 +61,7 @@ def get_config_fields() -> List[ConfigField]:

@staticmethod
@abstractmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
"""
Validates the config and raises an exception if it's invalid.
"""
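Since the abstract validate_config is now a coroutine, each connector below only changes its signature, and DataSourceContext.create_data_source awaits it. A minimal sketch of a subclass under the new contract; ExampleDataSource, ExampleConfig, and the URL check are illustrative assumptions, not part of this PR:

from typing import Dict, List

from data_source.api.base_data_source import BaseDataSource, BaseDataSourceConfig, ConfigField, HTMLInputType


class ExampleConfig(BaseDataSourceConfig):
    url: str
    token: str


class ExampleDataSource(BaseDataSource):
    # other abstract members (e.g. _feed_new_documents) omitted for brevity

    @staticmethod
    def get_config_fields() -> List[ConfigField]:
        return [
            ConfigField(label="URL", name="url", type=HTMLInputType.TEXT),
            ConfigField(label="API Token", name="token", type=HTMLInputType.TEXT),
        ]

    @staticmethod
    async def validate_config(config: Dict) -> None:
        # the body can stay synchronous; declaring it async lets callers await it uniformly
        parsed = ExampleConfig(**config)
        if not parsed.url.startswith("http"):
            raise ValueError("url must be an http(s) endpoint")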
36 changes: 25 additions & 11 deletions app/data_source/api/context.py
@@ -6,9 +6,10 @@
from data_source.api.base_data_source import BaseDataSource
from data_source.api.dynamic_loader import DynamicLoader, ClassInfo
from data_source.api.exception import KnownException
from db_engine import Session
from db_engine import Session, async_session
from pydantic.error_wrappers import ValidationError
from schemas import DataSourceType, DataSource

from sqlalchemy import select

logger = logging.getLogger(__name__)

@@ -48,22 +49,31 @@ def get_data_source_classes(cls) -> Dict[str, BaseDataSource]:
return cls._data_source_classes

@classmethod
def create_data_source(cls, name: str, config: dict) -> BaseDataSource:
with Session() as session:
data_source_type = session.query(DataSourceType).filter_by(name=name).first()
async def create_data_source(cls, name: str, config: dict) -> BaseDataSource:
async with async_session() as session:
data_source_type = await session.execute(
select(DataSourceType).filter_by(name=name)
)
data_source_type = data_source_type.scalar_one_or_none()
if data_source_type is None:
raise KnownException(message=f"Data source type {name} does not exist")

data_source_class = DynamicLoader.get_data_source_class(name)
logger.info(f"validating config for data source {name}")
data_source_class.validate_config(config)
await data_source_class.validate_config(config)
config_str = json.dumps(config)

data_source_row = DataSource(type_id=data_source_type.id, config=config_str, created_at=datetime.now())
data_source_row = DataSource(
type_id=data_source_type.id,
config=config_str,
created_at=datetime.now(),
)
session.add(data_source_row)
session.commit()
await session.commit()

data_source = data_source_class(config=config, data_source_id=data_source_row.id)
data_source = data_source_class(
config=config, data_source_id=data_source_row.id
)
cls._data_source_instances[data_source_row.id] = data_source

return data_source
@@ -95,8 +105,12 @@ def _load_connected_sources_from_db(cls):
for data_source in data_sources:
data_source_cls = DynamicLoader.get_data_source_class(data_source.type.name)
config = json.loads(data_source.config)
data_source_instance = data_source_cls(config=config, data_source_id=data_source.id,
last_index_time=data_source.last_indexed_at)
try:
data_source_instance = data_source_cls(config=config, data_source_id=data_source.id,
last_index_time=data_source.last_indexed_at)
except ValidationError as e:
logger.error(f"Error loading data source {data_source.id}: {e}")
continue  # skip this data source but keep loading the others
cls._data_source_instances[data_source.id] = data_source_instance

cls._initialized = True
25 changes: 25 additions & 0 deletions app/data_source/api/utils.py
@@ -4,12 +4,36 @@
from functools import lru_cache
from io import BytesIO
from typing import Optional
import time
import threading
from functools import wraps

import requests


logger = logging.getLogger(__name__)


def rate_limit(*, allowed_per_second: int):
max_period = 1.0 / allowed_per_second
last_call = [time.perf_counter()]
lock = threading.Lock()

def decorate(func):
@wraps(func)
def limit(*args, **kwargs):
with lock:
elapsed = time.perf_counter() - last_call[0]
hold = max_period - elapsed
if hold > 0:
time.sleep(hold)
result = func(*args, **kwargs)
last_call[0] = time.perf_counter()
return result
return limit
return decorate


def snake_case_to_pascal_case(snake_case_string: str):
"""Converts a snake case string to a PascalCase string"""
components = snake_case_string.split('_')
@@ -55,3 +79,4 @@ def get_confluence_user_image(image_url: str, token: str) -> Optional[str]:
return f"data:image/jpeg;base64,{base64.b64encode(image_bytes.getvalue()).decode()}"
except:
logger.warning(f"Failed to get confluence user image {image_url}")

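The rate_limit decorator added above serializes calls through a lock and sleeps just long enough to keep callers at or below the requested rate; the Stack Overflow connector below applies it to its HTTP helper. A short usage sketch (fetch is an illustrative name, not part of this PR):

import requests

from data_source.api.utils import rate_limit


@rate_limit(allowed_per_second=2)
def fetch(url: str) -> requests.Response:
    # even when called from several threads, at most ~2 requests per second are issued
    return requests.get(url)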
2 changes: 1 addition & 1 deletion app/data_source/sources/bookstack/bookstack.py
@@ -132,7 +132,7 @@ def list_books(book_stack: BookStack) -> List[Dict]:
raise e

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
try:
parsed_config = BookStackConfig(**config)
book_stack = BookStack(url=parsed_config.url, token_id=parsed_config.token_id,
2 changes: 1 addition & 1 deletion app/data_source/sources/confluence/confluence.py
@@ -61,7 +61,7 @@ def list_all_spaces(confluence: Confluence) -> List[Location]:
return spaces

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
try:
client = ConfluenceDataSource.confluence_client_from_config(config)
ConfluenceDataSource.list_spaces(confluence=client)
2 changes: 1 addition & 1 deletion app/data_source/sources/confluence/confluence_cloud.py
@@ -25,7 +25,7 @@ def get_config_fields() -> List[ConfigField]:
]

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
try:
client = ConfluenceCloudDataSource.confluence_client_from_config(config)
ConfluenceCloudDataSource.list_spaces(confluence=client)
2 changes: 1 addition & 1 deletion app/data_source/sources/google_drive/google_drive.py
@@ -43,7 +43,7 @@ def get_config_fields() -> List[ConfigField]:
]

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
try:
scopes = ['https://www.googleapis.com/auth/drive.readonly']
parsed_config = GoogleDriveConfig(**config)
2 changes: 1 addition & 1 deletion app/data_source/sources/mattermost/mattermost.py
@@ -54,7 +54,7 @@ def get_config_fields() -> List[ConfigField]:
]

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
try:
parsed_config = MattermostConfig(**config)
maattermost = Driver(options=asdict(parsed_config))
2 changes: 1 addition & 1 deletion app/data_source/sources/rocketchat/rocketchat.py
@@ -54,7 +54,7 @@ def get_display_name(cls) -> str:
return "Rocket.Chat"

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
rocket_chat_config = RocketchatConfig(**config)
should_verify_ssl = os.environ.get('ROCKETCHAT_VERIFY_SSL') is not None
rocket_chat = RocketChat(user_id=rocket_chat_config.token_id, auth_token=rocket_chat_config.token_secret,
2 changes: 1 addition & 1 deletion app/data_source/sources/slack/slack.py
@@ -42,7 +42,7 @@ def get_config_fields() -> List[ConfigField]:
]

@staticmethod
def validate_config(config: Dict) -> None:
async def validate_config(config: Dict) -> None:
slack_config = SlackConfig(**config)
slack = WebClient(token=slack_config.token)
slack.auth_test()
Empty file.
136 changes: 136 additions & 0 deletions app/data_source/sources/stackoverflow/stackoverflow.py
@@ -0,0 +1,136 @@
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
import requests

from data_source.api.base_data_source import BaseDataSource, ConfigField, HTMLInputType, BaseDataSourceConfig
from data_source.api.basic_document import DocumentType, BasicDocument
from queues.index_queue import IndexQueue

from data_source.api.utils import rate_limit

logger = logging.getLogger(__name__)


@dataclass
class StackOverflowPost:
link: str
score: int
last_activity_date: int
creation_date: int
post_id: Optional[int] = None
post_type: Optional[str] = None
body_markdown: Optional[str] = None
owner_account_id: Optional[int] = None
owner_reputation: Optional[int] = None
owner_user_id: Optional[int] = None
owner_user_type: Optional[str] = None
owner_profile_image: Optional[str] = None
owner_display_name: Optional[str] = None
owner_link: Optional[str] = None
title: Optional[str] = None
last_edit_date: Optional[int] = None
tags: Optional[List[str]] = None
view_count: Optional[int] = None
article_id: Optional[int] = None
article_type: Optional[str] = None

class StackOverflowConfig(BaseDataSourceConfig):
api_key: str
team_name: str


@rate_limit(allowed_per_second=15)
def rate_limited_get(url, headers):
'''
https://api.stackoverflowteams.com/docs/throttle
https://api.stackexchange.com/docs/throttle
Every application is subject to an IP based concurrent request throttle.
If a single IP is making more than 30 requests a second, new requests will be dropped.
The exact ban period is subject to change, but will be on the order of 30 seconds to a few minutes typically.
Note that exactly what response an application gets (in terms of HTTP code, text, and so on)
is undefined when subject to this ban; we consider > 30 request/sec per IP to be very abusive and thus cut the requests off very harshly.
'''
resp = requests.get(url, headers=headers)
if resp.status_code == 429:
logger.warning('Rate limited, sleeping for 5 minutes')
time.sleep(300)
return rate_limited_get(url, headers)
return resp


class StackOverflowDataSource(BaseDataSource):

@staticmethod
def get_config_fields() -> List[ConfigField]:
return [
ConfigField(label="PAT API Key", name="api_key", type=HTMLInputType.TEXT),
ConfigField(label="Team Name", name="team_name", type=HTMLInputType.TEXT),
]

@staticmethod
async def validate_config(config: Dict) -> None:
so_config = StackOverflowConfig(**config)
url = f'https://api.stackoverflowteams.com/2.3/questions?&team={so_config.team_name}'
response = rate_limited_get(url, headers={'X-API-Access-Token': so_config.api_key})
response.raise_for_status()

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
so_config = StackOverflowConfig(**self._raw_config)
self._api_key = so_config.api_key
self._team_name = so_config.team_name

def _fetch_posts(self, *, api_key: str, team_name: str, page: int, doc_type: str) -> None:
team_fragment = f'&team={team_name}'
# this is a filter for "body markdown" inclusion; all filters are unique and static
# I am not entirely sure whether this filter is per-account or usable by everyone
filter_fragment = '&filter=!nOedRLbqzB'
page_fragment = f'&page={page}'
# the timestamps appear to be 10-digit epoch seconds; only fetch items newer than the last index time
from_date_fragment = f'&fromdate={self._last_index_time.timestamp():.10n}'
url = f'https://api.stackoverflowteams.com/2.3/{doc_type}?{team_fragment}{filter_fragment}{page_fragment}{from_date_fragment}'
response = rate_limited_get(url, headers={'X-API-Access-Token': api_key})
response.raise_for_status()
response = response.json()
has_more = response['has_more']
items = response['items']
logger.info(f'Fetched {len(items)} {doc_type} from Stack Overflow')
for item_dict in items:
owner_fields = {}
if 'owner' in item_dict:
owner_fields = {f"owner_{k}": v for k, v in item_dict.pop('owner').items()}
if 'title' not in item_dict:
item_dict['title'] = item_dict['link']
post = StackOverflowPost(**item_dict, **owner_fields)
last_modified = datetime.fromtimestamp(post.last_edit_date or post.last_activity_date)
if last_modified < self._last_index_time:
return
logger.info(f'Feeding {doc_type} {post.title}')
post_document = BasicDocument(title=post.title, content=post.body_markdown, author=post.owner_display_name,
timestamp=datetime.fromtimestamp(post.creation_date), id=post.post_id,
data_source_id=self._data_source_id, location=post.link,
url=post.link, author_image_url=post.owner_profile_image,
type=DocumentType.MESSAGE)
IndexQueue.get_instance().put_single(doc=post_document)
if has_more:
# paginate onto the queue
self.add_task_to_queue(self._fetch_posts, api_key=self._api_key, team_name=self._team_name, page=page + 1, doc_type=doc_type)

def _feed_new_documents(self) -> None:
self.add_task_to_queue(self._fetch_posts, api_key=self._api_key, team_name=self._team_name, page=1, doc_type='posts')
# TODO: figure out how to get articles
# self.add_task_to_queue(self._fetch_posts, api_key=self._api_key, team_name=self._team_name, page=1, doc_type='articles')


# def test():
# import os
# config = {"api_key": os.environ['SO_API_KEY'], "team_name": os.environ['SO_TEAM_NAME']}
# so = StackOverflowDataSource(config=config, data_source_id=1)
# so._feed_new_documents()
#
#
# if __name__ == '__main__':
# test()
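For reference, _fetch_posts above assumes the standard Stack Exchange response envelope: has_more plus an items list whose entries map onto StackOverflowPost fields, with the nested owner object flattened into owner_* keys. A rough sketch of the shape being parsed, with illustrative values only:

example_response = {
    "has_more": True,
    "items": [
        {
            "link": "https://example.stackoverflowteams.com/c/example-team/questions/1",
            "score": 3,
            "creation_date": 1680000000,
            "last_activity_date": 1680003600,
            "title": "How do we rotate the deploy key?",
            "body_markdown": "We need to rotate it because...",
            "owner": {
                "display_name": "Jane Doe",
                "profile_image": "https://example.com/avatar.png",
            },
        },
    ],
}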
10 changes: 9 additions & 1 deletion app/db_engine.py
@@ -5,11 +5,19 @@

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
# import base document and then register all classes
from schemas.base import Base

from paths import SQLITE_DB_PATH

engine = create_engine(f'sqlite:///{SQLITE_DB_PATH}')
db_url = f'sqlite:///{SQLITE_DB_PATH}'
print('DB engine path:', db_url)
engine = create_engine(db_url)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

async_db_url = db_url.replace('sqlite', 'sqlite+aiosqlite', 1)
print('ASYNC DB engine path:', async_db_url)
async_engine = create_async_engine(async_db_url)
async_session = sessionmaker(async_engine, expire_on_commit=False, class_=AsyncSession)
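The async engine reuses the same SQLite file through the sqlite+aiosqlite driver, so the aiosqlite package needs to be installed alongside SQLAlchemy's asyncio support. A minimal sketch of consuming the new async_session factory, mirroring the create_data_source change above (list_data_source_types is an illustrative helper, not part of this PR):

import asyncio

from sqlalchemy import select

from db_engine import async_session
from schemas import DataSourceType


async def list_data_source_types() -> list:
    async with async_session() as session:
        result = await session.execute(select(DataSourceType))
        return result.scalars().all()


# example invocation (assuming no event loop is already running):
# print(asyncio.run(list_data_source_types()))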