Skip to content

feat(data): persist data in git repo #401

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ py-cord==2.6.1
python-dotenv==1.1.0
requests==2.32.3
requests-oauthlib==2.0.0
tinydb==4.8.2
261 changes: 255 additions & 6 deletions src/common/database.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,271 @@
# standard imports
import os
from pathlib import Path
import shelve
import threading
import traceback
from typing import Union

# lib imports
import git
from tinydb import TinyDB
from tinydb.storages import JSONStorage
from tinydb.middlewares import CachingMiddleware

# local imports
from src.common.common import data_dir

# Constants
# Module-level lock shared by all Database instances. Database.sync() holds it
# around its git status/add/commit/push sequence.
DATA_REPO_LOCK = threading.Lock()


class Database:
    """
    TinyDB-backed JSON database that can be persisted in a git data repository.

    The instance is a context manager: entering it acquires a per-instance lock and returns the
    ``TinyDB`` object; leaving it flushes the cache, optionally commits/pushes the JSON files to
    git, and releases the lock.

    Examples
    --------
    >>> db = Database(db_name='example', use_git=False)  # doctest: +SKIP
    >>> with db as tiny:  # doctest: +SKIP
    ...     tiny.table('users').insert({'name': 'foo'})
    """

    def __init__(self, db_name: str, db_dir: Union[str, Path] = data_dir, use_git: bool = True):
        """
        Open (and if needed clone, migrate and create) the database.

        Parameters
        ----------
        db_name : str
            Base name of the database; the JSON file is ``<db_name>.json``.
        db_dir : Union[str, Path]
            Directory holding the data. When git is used, the repository lives in the
            ``support-bot-data`` sub-directory of this directory.
        use_git : bool
            Request git persistence. It is forced off when the ``GITHUB_PYTEST`` environment
            variable equals ``true`` (case-insensitive).

        Raises
        ------
        git.exc.GitCommandError
            If cloning fails for a reason other than the requested branch being missing, or if
            checking out an already known branch fails.
        """
        self.db_name = db_name
        self.db_dir = db_dir

        # Check for CI environment
        is_ci = os.environ.get('GITHUB_PYTEST', '').lower() == 'true'
        self.use_git = use_git and not is_ci

        # Always define the git attributes so that sync() can test them even when git is disabled.
        self.repo = None
        self.repo_url = None
        self.repo_branch = None
        if self.use_git:
            self.repo_url = os.getenv("DATA_REPO", "https://github.com/LizardByte/support-bot-data")
            self.repo_branch = os.getenv("DATA_REPO_BRANCH", "master")
            self.db_dir = os.path.join(self.db_dir, "support-bot-data")

            if not os.path.exists(self.db_dir):
                self._clone_data_repo()
            else:
                self._open_data_repo()

        self.json_path = os.path.join(self.db_dir, f"{self.db_name}.json")
        # Built from the ``db_dir`` argument, not the git-adjusted ``self.db_dir``.
        # NOTE(review): presumably because legacy shelve files live in the original data
        # directory - confirm.
        self.shelve_path = os.path.join(db_dir, self.db_name)  # Shelve adds its own extensions
        self.lock = threading.Lock()

        # Check if migration is needed before creating TinyDB instance
        self._check_for_migration()

        # Initialize the TinyDB instance with CachingMiddleware
        self.tinydb = TinyDB(
            self.json_path,
            storage=CachingMiddleware(JSONStorage),
            indent=4,
        )

    def _clone_data_repo(self):
        """
        Clone the data repository into ``self.db_dir`` and store it in ``self.repo``.

        If the configured branch does not exist on the remote, the default branch is cloned and a
        new orphan branch containing only a ``.gitkeep`` file is created, committed and pushed.
        A failing push of that new branch is reported and otherwise ignored.

        Raises
        ------
        git.exc.GitCommandError
            If the clone fails for any reason other than the branch being missing upstream.
        """
        print(f"Cloning repository {self.repo_url} to {self.db_dir}")
        try:
            # Try cloning with the specified branch
            self.repo = git.Repo.clone_from(self.repo_url, self.db_dir, branch=self.repo_branch)
        except git.exc.GitCommandError as e:
            error_text = str(e)
            if "Remote branch" not in error_text or "not found in upstream origin" not in error_text:
                # Re-raise if it's a different error
                raise

            print(f"Branch '{self.repo_branch}' not found in remote. Creating a new empty branch.")

            # Clone with default branch first
            self.repo = git.Repo.clone_from(self.repo_url, self.db_dir)

            # Create a new orphan branch (not based on any other branch)
            self.repo.git.checkout('--orphan', self.repo_branch)

            # Clear the index
            try:
                self.repo.git.rm('-rf', '.', '--cached')
            except git.exc.GitCommandError:
                # This might fail if there are no files yet, which is fine
                pass

            # The clone is fresh, so everything in the working tree came from the default branch.
            self._clear_worktree()
            self._commit_gitkeep()

            # Push the new branch to remote
            try:
                self.repo.git.push('--set-upstream', 'origin', self.repo_branch)
                print(f"Created and pushed new empty branch '{self.repo_branch}'")
            except git.exc.GitCommandError as push_error:
                # Continue anyway - we might not have push permissions
                print(f"Failed to push new branch: {str(push_error)}")

    def _open_data_repo(self):
        """
        Open the existing repository in ``self.db_dir`` and check out the configured branch.

        When the branch name is not among the known refs, ``origin`` is fetched; the branch is
        then checked out if the remote has it, otherwise a new orphan branch is created, committed
        and pushed. Git failures in that second path are reported and the current branch is kept.

        Raises
        ------
        git.exc.GitCommandError
            If checking out an already known branch fails.
        """
        self.repo = git.Repo(self.db_dir)

        # NOTE(review): only the last path component of each ref is compared, so branch names
        # containing '/' can never match, and remote-tracking refs/tags also count as "known".
        known_names = [ref.name.split('/')[-1] for ref in self.repo.refs]
        if self.repo_branch in known_names:
            # Branch is known, make sure it's checked out
            self.repo.git.checkout(self.repo_branch)
            return

        # Branch isn't known locally, check if it exists remotely
        try:
            self.repo.git.fetch('origin')
            remote_branches = [ref.name.split('/')[-1] for ref in self.repo.remote().refs]

            if self.repo_branch in remote_branches:
                # Checkout existing remote branch
                self.repo.git.checkout(self.repo_branch)
            else:
                # Create new orphan branch; working tree files are left in place here.
                self.repo.git.checkout('--orphan', self.repo_branch)
                self.repo.git.rm('-rf', '.', '--cached')

                self._commit_gitkeep()
                self.repo.git.push('--set-upstream', 'origin', self.repo_branch)
                print(f"Created and pushed new empty branch '{self.repo_branch}'")
        except git.exc.GitCommandError:
            print(f"Failed to work with branch '{self.repo_branch}'. Using current branch instead.")

    def _clear_worktree(self):
        """Delete every entry in ``self.db_dir`` except the ``.git`` directory."""
        # Function-scope import, done once instead of once per directory entry.
        import shutil

        for item in os.listdir(self.db_dir):
            if item == '.git':
                continue
            item_path = os.path.join(self.db_dir, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)

    def _commit_gitkeep(self):
        """Create an empty ``.gitkeep`` file in ``self.db_dir`` and commit it on the current branch."""
        # Create empty .gitkeep file to ensure the branch can be committed
        gitkeep_path = os.path.join(self.db_dir, '.gitkeep')
        with open(gitkeep_path, 'w'):
            pass

        # Add and commit the .gitkeep file
        self.repo.git.add(gitkeep_path)
        self.repo.git.commit('-m', f"Initialize empty branch '{self.repo_branch}'")

    def _check_for_migration(self):
        """Run the shelve -> TinyDB migration when a shelve file exists but the JSON file does not."""
        # No extension is used on Linux
        shelve_exists = os.path.exists(f"{self.shelve_path}.dat") or os.path.exists(self.shelve_path)
        json_exists = os.path.exists(self.json_path)

        if shelve_exists and not json_exists:
            print(f"Migrating database from shelve to TinyDB: {self.shelve_path}")
            self._migrate_from_shelve()

    @staticmethod
    def _reddit_comment_record(record_id, record_data: dict) -> dict:
        """Build the TinyDB record for a migrated Reddit comment."""
        return {
            'reddit_id': record_data.get('id', record_id),
            'author': record_data.get('author'),
            'body': record_data.get('body'),
            'created_utc': record_data.get('created_utc', 0),
            'processed': record_data.get('processed', False),
            'slash_command': record_data.get('slash_command', {
                'project': None,
                'command': None,
            }),
        }

    @staticmethod
    def _reddit_submission_record(record_id, record_data: dict) -> dict:
        """Build the TinyDB record for a migrated Reddit submission."""
        return {
            'reddit_id': record_data.get('id', record_id),
            'title': record_data.get('title'),
            'selftext': record_data.get('selftext'),
            'author': str(record_data.get('author')),
            'created_utc': record_data.get('created_utc', 0),
            'permalink': record_data.get('permalink'),
            'url': record_data.get('url'),
            'link_flair_text': record_data.get('link_flair_text'),
            'link_flair_background_color': record_data.get('link_flair_background_color'),
            'bot_discord': record_data.get('bot_discord', {
                'sent': False,
                'sent_utc': None,
            }),
        }

    def _migrate_from_shelve(self):
        """
        Copy the legacy shelve database at ``self.shelve_path`` into a TinyDB file at ``self.json_path``.

        Each shelve key whose value is a dict with only string keys becomes a TinyDB table; each
        dict-valued entry of it becomes one record. Other keys and entries are skipped. Any
        exception is printed together with its traceback and is not propagated.
        """
        try:
            # Determine if this is the Reddit database
            is_reddit_db = "reddit_bot" in self.db_name

            # Temporary database just for migration; the context manager closes the file
            # handle even when the migration fails part-way.
            with TinyDB(
                self.json_path,
                storage=CachingMiddleware(JSONStorage),
                indent=4,
            ) as migration_db, shelve.open(self.shelve_path) as shelve_db:
                # Process each key in the shelve database
                for key in shelve_db.keys():
                    value = shelve_db[key]

                    # Only dicts that look like a collection of records are migrated
                    if not isinstance(value, dict) or not all(isinstance(k, str) for k in value):
                        continue

                    table = migration_db.table(key)

                    for record_id, record_data in value.items():
                        if not isinstance(record_data, dict):
                            continue

                        if not is_reddit_db:
                            # Non-Reddit databases keep original structure; insert a copy so
                            # the object read from shelve is not modified.
                            table.insert({**record_data, 'id': record_id})
                        elif 'body' in record_data:
                            # A 'body' key marks the record as a comment
                            table.insert(self._reddit_comment_record(record_id, record_data))
                        else:
                            table.insert(self._reddit_submission_record(record_id, record_data))

                # Flush changes to disk
                migration_db.storage.flush()

            print(f"Migration completed successfully: {self.json_path}")
        except Exception as e:
            print(f"Migration failed: {str(e)}")
            traceback.print_exc()

    def __enter__(self):
        """Acquire the instance lock and return the ``TinyDB`` object."""
        self.lock.acquire()
        return self.tinydb

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Sync the database, then release the instance lock even if syncing raised."""
        try:
            self.sync()
        finally:
            self.lock.release()

    def sync(self):
        """
        Flush cached writes to disk and, when git is enabled, commit and push the JSON files.

        All ``*.json`` files in ``self.db_dir`` are staged. A commit is only created when the
        staging area actually contains changes. Git failures are printed, not raised.
        """
        # Only call flush if using CachingMiddleware
        if hasattr(self.tinydb.storage, 'flush'):
            self.tinydb.storage.flush()

        # Git operations - commit and push changes if using git
        with DATA_REPO_LOCK:
            if not self.use_git or self.repo is None:
                return

            try:
                # Nothing modified or untracked: nothing to do
                if not self.repo.git.status('--porcelain'):
                    return

                # Add ALL json files in the directory to ensure we track all databases
                for json_file in os.listdir(self.db_dir):
                    if json_file.endswith('.json'):
                        self.repo.git.add(os.path.join(self.db_dir, json_file))

                # Look at the staging area only: untracked or modified non-JSON files must not
                # trigger a commit that git would reject with "nothing to commit".
                if not self.repo.git.diff('--cached', '--name-only'):
                    return

                # Commit all changes at once with a general message
                commit_message = "Update database files"
                self.repo.git.commit('-m', commit_message)
                print("Committed changes to git data repository")

                # Push to remote
                # NOTE(review): depending on the GitPython version, a rejected push may be
                # reported through the returned PushInfo list instead of an exception - confirm.
                try:
                    origin = self.repo.remote('origin')
                    origin.push()
                    print("Pushed changes to remote git data repository")
                except git.exc.GitCommandError as e:
                    print(f"Failed to push changes: {str(e)}")
            except Exception as e:
                print(f"Git operation failed: {str(e)}")
                traceback.print_exc()
60 changes: 42 additions & 18 deletions src/common/webapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import discord
from flask import Flask, jsonify, redirect, request, Response, send_from_directory
from requests_oauthlib import OAuth2Session
from tinydb import Query
from werkzeug.middleware.proxy_fix import ProxyFix

# local imports
Expand Down Expand Up @@ -109,8 +110,7 @@ def discord_callback():
return Response(html.escape(request.args['error_description']), status=400)

# get all active states from the global state manager
with globals.DISCORD_BOT.db as db:
active_states = db['oauth_states']
active_states = globals.DISCORD_BOT.oauth_states

discord_oauth = OAuth2Session(DISCORD_CLIENT_ID, redirect_uri=DISCORD_REDIRECT_URI)
token = discord_oauth.fetch_token(
Expand Down Expand Up @@ -144,19 +144,32 @@ def discord_callback():
connections_response = discord_oauth.get("https://discord.com/api/users/@me/connections")
connections = connections_response.json()

# Default user data
user_data = {
'user_id': int(discord_user['id']),
'discord_username': discord_user['username'],
'discord_global_name': discord_user['global_name'],
'github_id': None,
'github_username': None,
}

# Check for GitHub connections
for connection in connections:
if connection['type'] == 'github':
user_data['github_id'] = int(connection['id'])
user_data['github_username'] = connection['name']

with globals.DISCORD_BOT.db as db:
db['discord_users'] = db.get('discord_users', {})
db['discord_users'][discord_user['id']] = {
'discord_username': discord_user['username'],
'discord_global_name': discord_user['global_name'],
'github_id': None,
'github_username': None,
}
query = Query()

# Get the discord_users table
discord_users_table = db.table('discord_users')

for connection in connections:
if connection['type'] == 'github':
db['discord_users'][discord_user['id']]['github_id'] = connection['id']
db['discord_users'][discord_user['id']]['github_username'] = connection['name']
# Upsert the user data
discord_users_table.upsert(
user_data,
query.user_id == int(discord_user['id'])
)

globals.DISCORD_BOT.update_cached_message(
author_id=discord_user['id'],
Expand All @@ -177,8 +190,7 @@ def github_callback():
state = request.args.get('state')

# get all active states from the global state manager
with globals.DISCORD_BOT.db as db:
active_states = db['oauth_states']
active_states = globals.DISCORD_BOT.oauth_states

github_oauth = OAuth2Session(GITHUB_CLIENT_ID, redirect_uri=GITHUB_REDIRECT_URI)
token = github_oauth.fetch_token(
Expand Down Expand Up @@ -215,14 +227,26 @@ def github_callback():
discord_user = discord_user_future.result()

with globals.DISCORD_BOT.db as db:
db['discord_users'] = db.get('discord_users', {})
db['discord_users'][discord_user_id] = {
query = Query()

# Get the discord_users table
discord_users_table = db.table('discord_users')

# Create user data object
user_data = {
'user_id': int(discord_user_id),
'discord_username': discord_user.name,
'discord_global_name': discord_user.global_name,
'github_id': github_user['id'],
'github_id': int(github_user['id']),
'github_username': github_user['login'],
}

# Upsert the user data (insert or update)
discord_users_table.upsert(
user_data,
query.user_id == int(discord_user_id)
)

globals.DISCORD_BOT.update_cached_message(
author_id=discord_user_id,
reason='success',
Expand Down
Loading
Loading