Skip to content

Commit 9f33b3d

Browse files
authored
Docker support (Mintplex-Labs#34)
* Updates for Linux for frontend/server * frontend/server docker * updated Dockerfile for deps related to node vectordb * updates for collector in docker * docker deps for ODT processing * ignore another collector dir * storage mount improvements; run as UID * fix pypandoc version typo * permissions fixes
1 parent ebd3a62 commit 9f33b3d

32 files changed

+4921
-200
lines changed

.dockerignore

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
server/storage/documents/**
2+
server/storage/vector-cache/**
3+
server/storage/*.db
4+
server/storage/lancedb
5+
collector/hotdir/**
6+
collector/v-env/**
7+
collector/outputs/**
8+
**/node_modules/
9+
**/dist/
10+
**/v-env/
11+
**/__pycache__/
12+
**/.env
13+
**/.env.*

.gitignore

-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,5 @@ v-env
55
node_modules
66
__pycache__
77
v-env
8-
*.lock
98
.DS_Store
109

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ Next, you will need some content to embed. This could be a Youtube Channel, Medi
5252

5353
[Go set up and run collector scripts](./collector/README.md)
5454

55-
[Learn about documents](./server/documents/DOCUMENTS.md)
55+
[Learn about documents](./server/storage/documents/DOCUMENTS.md)
5656

57-
[Learn about vector caching](./server/vector-cache/VECTOR_CACHE.md)
57+
[Learn about vector caching](./server/storage/vector-cache/VECTOR_CACHE.md)
5858

5959
### Contributing
6060
- create issue

collector/main.py

+30-41
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from whaaaaat import prompt, Separator
2+
from InquirerPy import inquirer
33
from scripts.youtube import youtube
44
from scripts.link import link, links
55
from scripts.substack import substack
@@ -20,57 +20,46 @@ def main():
2020
selection = input("Your selection: ")
2121
method = methods.get(str(selection))
2222
else:
23-
questions = [
24-
{
25-
"type": "list",
26-
"name": "collector",
27-
"message": "What kind of data would you like to add to convert into long-term memory?",
28-
"choices": [
29-
"YouTube Channel",
30-
"Substack",
31-
"Medium",
32-
"Article or Blog Link(s)",
33-
"Gitbook",
34-
Separator(),
35-
{"name": "Twitter", "disabled": "Needs PR"},
36-
"Abort",
37-
],
38-
},
39-
]
40-
method = prompt(questions).get('collector')
41-
42-
if('Article or Blog Link' in method):
43-
questions = [
44-
{
45-
"type": "list",
46-
"name": "collector",
47-
"message": "Do you want to scrape a single article/blog/url or many at once?",
48-
"choices": [
49-
'Single URL',
50-
'Multiple URLs',
51-
'Abort',
52-
],
53-
},
54-
]
55-
method = prompt(questions).get('collector')
56-
if(method == 'Single URL'):
23+
method = inquirer.select(
24+
message="What kind of data would you like to add to convert into long-term memory?",
25+
choices=[
26+
{"name": "YouTube Channel", "value": "YouTube Channel"},
27+
{"name": "Substack", "value": "Substack"},
28+
{"name": "Medium", "value": "Medium"},
29+
{"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
30+
{"name": "Gitbook", "value": "Gitbook"},
31+
{"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
32+
{"name": "Abort", "value": "Abort"},
33+
],
34+
).execute()
35+
36+
if 'Article or Blog Link' in method:
37+
method = inquirer.select(
38+
message="Do you want to scrape a single article/blog/url or many at once?",
39+
choices=[
40+
{"name": "Single URL", "value": "Single URL"},
41+
{"name": "Multiple URLs", "value": "Multiple URLs"},
42+
{"name": "Abort", "value": "Abort"},
43+
],
44+
).execute()
45+
if method == 'Single URL':
5746
link()
5847
exit(0)
59-
if(method == 'Multiple URLs'):
48+
if method == 'Multiple URLs':
6049
links()
6150
exit(0)
6251

63-
if(method == 'Abort'): exit(0)
64-
if(method == 'YouTube Channel'):
52+
if method == 'Abort': exit(0)
53+
if method == 'YouTube Channel':
6554
youtube()
6655
exit(0)
67-
if(method == 'Substack'):
56+
if method == 'Substack':
6857
substack()
6958
exit(0)
70-
if(method == 'Medium'):
59+
if method == 'Medium':
7160
medium()
7261
exit(0)
73-
if(method == 'Gitbook'):
62+
if method == 'Gitbook':
7463
gitbook()
7564
exit(0)
7665

collector/requirements.txt

+6-123
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ cryptography==41.0.1
2020
cssselect==1.2.0
2121
dataclasses-json==0.5.7
2222
Deprecated==1.2.14
23+
docx2txt==0.8
2324
et-xmlfile==1.1.0
2425
exceptiongroup==1.1.1
2526
fake-useragent==1.1.3
@@ -30,6 +31,7 @@ h11==0.14.0
3031
httpcore==0.16.3
3132
httpx==0.23.3
3233
idna==3.4
34+
InquirerPy==0.3.4
3335
importlib-metadata==6.6.0
3436
importlib-resources==5.12.0
3537
install==1.3.5
@@ -54,132 +56,13 @@ pandas==1.5.3
5456
parse==1.19.0
5557
pdfminer.six==20221105
5658
Pillow==9.5.0
57-
prompt-toolkit==1.0.14
59+
prompt-toolkit==3.0.38
5860
pycparser==2.21
5961
pydantic==1.10.8
6062
pyee==8.2.2
6163
Pygments==2.15.1
62-
pyobjc==9.1.1
63-
pyobjc-core==9.1.1
64-
pyobjc-framework-Accounts==9.1.1
65-
pyobjc-framework-AddressBook==9.1.1
66-
pyobjc-framework-AdSupport==9.1.1
67-
pyobjc-framework-AppleScriptKit==9.1.1
68-
pyobjc-framework-AppleScriptObjC==9.1.1
69-
pyobjc-framework-ApplicationServices==9.1.1
70-
pyobjc-framework-AudioVideoBridging==9.1.1
71-
pyobjc-framework-AuthenticationServices==9.1.1
72-
pyobjc-framework-AutomaticAssessmentConfiguration==9.1.1
73-
pyobjc-framework-Automator==9.1.1
74-
pyobjc-framework-AVFoundation==9.1.1
75-
pyobjc-framework-AVKit==9.1.1
76-
pyobjc-framework-BusinessChat==9.1.1
77-
pyobjc-framework-CalendarStore==9.1.1
78-
pyobjc-framework-CFNetwork==9.1.1
79-
pyobjc-framework-CloudKit==9.1.1
80-
pyobjc-framework-Cocoa==9.1.1
81-
pyobjc-framework-Collaboration==9.1.1
82-
pyobjc-framework-ColorSync==9.1.1
83-
pyobjc-framework-Contacts==9.1.1
84-
pyobjc-framework-ContactsUI==9.1.1
85-
pyobjc-framework-CoreAudio==9.1.1
86-
pyobjc-framework-CoreAudioKit==9.1.1
87-
pyobjc-framework-CoreBluetooth==9.1.1
88-
pyobjc-framework-CoreData==9.1.1
89-
pyobjc-framework-CoreHaptics==9.1.1
90-
pyobjc-framework-CoreLocation==9.1.1
91-
pyobjc-framework-CoreMedia==9.1.1
92-
pyobjc-framework-CoreMediaIO==9.1.1
93-
pyobjc-framework-CoreMIDI==9.1.1
94-
pyobjc-framework-CoreML==9.1.1
95-
pyobjc-framework-CoreMotion==9.1.1
96-
pyobjc-framework-CoreServices==9.1.1
97-
pyobjc-framework-CoreSpotlight==9.1.1
98-
pyobjc-framework-CoreText==9.1.1
99-
pyobjc-framework-CoreWLAN==9.1.1
100-
pyobjc-framework-CryptoTokenKit==9.1.1
101-
pyobjc-framework-DeviceCheck==9.1.1
102-
pyobjc-framework-DictionaryServices==9.1.1
103-
pyobjc-framework-DiscRecording==9.1.1
104-
pyobjc-framework-DiscRecordingUI==9.1.1
105-
pyobjc-framework-DiskArbitration==9.1.1
106-
pyobjc-framework-DVDPlayback==9.1.1
107-
pyobjc-framework-EventKit==9.1.1
108-
pyobjc-framework-ExceptionHandling==9.1.1
109-
pyobjc-framework-ExecutionPolicy==9.1.1
110-
pyobjc-framework-ExternalAccessory==9.1.1
111-
pyobjc-framework-FileProvider==9.1.1
112-
pyobjc-framework-FileProviderUI==9.1.1
113-
pyobjc-framework-FinderSync==9.1.1
114-
pyobjc-framework-FSEvents==9.1.1
115-
pyobjc-framework-GameCenter==9.1.1
116-
pyobjc-framework-GameController==9.1.1
117-
pyobjc-framework-GameKit==9.1.1
118-
pyobjc-framework-GameplayKit==9.1.1
119-
pyobjc-framework-ImageCaptureCore==9.1.1
120-
pyobjc-framework-IMServicePlugIn==9.1.1
121-
pyobjc-framework-InputMethodKit==9.1.1
122-
pyobjc-framework-InstallerPlugins==9.1.1
123-
pyobjc-framework-InstantMessage==9.1.1
124-
pyobjc-framework-Intents==9.1.1
125-
pyobjc-framework-IOBluetooth==9.1.1
126-
pyobjc-framework-IOBluetoothUI==9.1.1
127-
pyobjc-framework-IOSurface==9.1.1
128-
pyobjc-framework-iTunesLibrary==9.1.1
129-
pyobjc-framework-LatentSemanticMapping==9.1.1
130-
pyobjc-framework-LaunchServices==9.1.1
131-
pyobjc-framework-libdispatch==9.1.1
132-
pyobjc-framework-libxpc==9.1.1
133-
pyobjc-framework-LinkPresentation==9.1.1
134-
pyobjc-framework-LocalAuthentication==9.1.1
135-
pyobjc-framework-MapKit==9.1.1
136-
pyobjc-framework-MediaAccessibility==9.1.1
137-
pyobjc-framework-MediaLibrary==9.1.1
138-
pyobjc-framework-MediaPlayer==9.1.1
139-
pyobjc-framework-MediaToolbox==9.1.1
140-
pyobjc-framework-Metal==9.1.1
141-
pyobjc-framework-MetalKit==9.1.1
142-
pyobjc-framework-MetalPerformanceShaders==9.1.1
143-
pyobjc-framework-ModelIO==9.1.1
144-
pyobjc-framework-MultipeerConnectivity==9.1.1
145-
pyobjc-framework-NaturalLanguage==9.1.1
146-
pyobjc-framework-NetFS==9.1.1
147-
pyobjc-framework-Network==9.1.1
148-
pyobjc-framework-NetworkExtension==9.1.1
149-
pyobjc-framework-NotificationCenter==9.1.1
150-
pyobjc-framework-OpenDirectory==9.1.1
151-
pyobjc-framework-OSAKit==9.1.1
152-
pyobjc-framework-OSLog==9.1.1
153-
pyobjc-framework-PencilKit==9.1.1
154-
pyobjc-framework-Photos==9.1.1
155-
pyobjc-framework-PhotosUI==9.1.1
156-
pyobjc-framework-PreferencePanes==9.1.1
157-
pyobjc-framework-PushKit==9.1.1
158-
pyobjc-framework-Quartz==9.1.1
159-
pyobjc-framework-QuickLookThumbnailing==9.1.1
160-
pyobjc-framework-SafariServices==9.1.1
161-
pyobjc-framework-SceneKit==9.1.1
162-
pyobjc-framework-ScreenSaver==9.1.1
163-
pyobjc-framework-ScriptingBridge==9.1.1
164-
pyobjc-framework-SearchKit==9.1.1
165-
pyobjc-framework-Security==9.1.1
166-
pyobjc-framework-SecurityFoundation==9.1.1
167-
pyobjc-framework-SecurityInterface==9.1.1
168-
pyobjc-framework-ServiceManagement==9.1.1
169-
pyobjc-framework-Social==9.1.1
170-
pyobjc-framework-SoundAnalysis==9.1.1
171-
pyobjc-framework-Speech==9.1.1
172-
pyobjc-framework-SpriteKit==9.1.1
173-
pyobjc-framework-StoreKit==9.1.1
174-
pyobjc-framework-SyncServices==9.1.1
175-
pyobjc-framework-SystemConfiguration==9.1.1
176-
pyobjc-framework-SystemExtensions==9.1.1
177-
pyobjc-framework-UserNotifications==9.1.1
178-
pyobjc-framework-VideoSubscriberAccount==9.1.1
179-
pyobjc-framework-VideoToolbox==9.1.1
180-
pyobjc-framework-Vision==9.1.1
181-
pyobjc-framework-WebKit==9.1.1
18264
pypandoc==1.4
65+
pypdf==3.9.0
18366
pyppeteer==1.0.2
18467
pyquery==2.0.0
18568
python-dateutil==2.8.2
@@ -199,6 +82,7 @@ six==1.16.0
19982
sniffio==1.3.0
20083
soupsieve==2.4.1
20184
SQLAlchemy==2.0.15
85+
tabulate==0.9.0
20286
tenacity==8.2.2
20387
text-unidecode==1.3
20488
tiktoken==0.4.0
@@ -212,10 +96,9 @@ uuid==1.30
21296
w3lib==2.1.1
21397
wcwidth==0.2.6
21498
websockets==10.4
215-
whaaaaat==0.5.2
21699
wrapt==1.14.1
217100
xlrd==2.0.1
218101
XlsxWriter==3.1.2
219102
yarl==1.9.2
220103
youtube-transcript-api==0.6.0
221-
zipp==3.15.0
104+
zipp==3.15.0

collector/scripts/gitbook.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def gitbook():
1414

1515
primary_source = urlparse(url)
1616
output_path = f"./outputs/gitbook-logs/{primary_source.netloc}"
17-
transaction_output_dir = f"../server/documents/gitbook-{primary_source.netloc}"
17+
transaction_output_dir = f"../server/storage/documents/gitbook-{primary_source.netloc}"
1818

1919
if os.path.exists(output_path) == False:os.makedirs(output_path)
2020
if os.path.exists(transaction_output_dir) == False: os.makedirs(transaction_output_dir)

collector/scripts/link.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def link():
3636
output_path = f"./outputs/website-logs"
3737

3838
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
39-
transaction_output_dir = f"../server/documents/website-{source.netloc}"
39+
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
4040

4141
if os.path.isdir(output_path) == False:
4242
os.makedirs(output_path)
@@ -109,7 +109,7 @@ def links():
109109
output_path = f"./outputs/website-logs"
110110

111111
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
112-
transaction_output_dir = f"../server/documents/website-{source.netloc}"
112+
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
113113

114114
if os.path.isdir(output_path) == False:
115115
os.makedirs(output_path)

collector/scripts/medium.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def medium():
2323
exit(1)
2424

2525
totalTokenCount = 0
26-
transaction_output_dir = f"../server/documents/medium-{handle}"
26+
transaction_output_dir = f"../server/storage/documents/medium-{handle}"
2727
if os.path.isdir(transaction_output_dir) == False:
2828
os.makedirs(transaction_output_dir)
2929

collector/scripts/substack.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def substack():
2727
print(f"{len(valid_publications)} of {len(publications)} publications are readable publically text posts - collecting those.")
2828

2929
totalTokenCount = 0
30-
transaction_output_dir = f"../server/documents/substack-{subdomain}"
30+
transaction_output_dir = f"../server/storage/documents/substack-{subdomain}"
3131
if os.path.isdir(transaction_output_dir) == False:
3232
os.makedirs(transaction_output_dir)
3333

collector/scripts/watch/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def move_source(working_dir='hotdir', new_destination_filename= ''):
2424
return
2525

2626
def write_to_server_documents(data, filename):
27-
destination = f"../server/documents/custom-documents"
27+
destination = f"../server/storage/documents/custom-documents"
2828
if os.path.exists(destination) == False: os.makedirs(destination)
2929
with open(f"{destination}/{filename}.json", 'w', encoding='utf-8') as file:
3030
json.dump(data, file, ensure_ascii=True, indent=4)

collector/scripts/youtube.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def youtube():
1717
exit(1)
1818

1919
channel_data = fetch_channel_video_information(channel_id)
20-
transaction_output_dir = f"../server/documents/youtube-{channel_data.get('channelTitle')}"
20+
transaction_output_dir = f"../server/storage/documents/youtube-{channel_data.get('channelTitle')}"
2121

2222
if os.path.isdir(transaction_output_dir) == False:
2323
os.makedirs(transaction_output_dir)

docker/.env.example

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
SERVER_PORT=3001
2+
OPEN_AI_KEY=
3+
OPEN_MODEL_PREF='gpt-3.5-turbo'
4+
CACHE_VECTORS="true"
5+
6+
# Enable all below if you are using vector database: Chroma.
7+
# VECTOR_DB="chroma"
8+
# CHROMA_ENDPOINT='http://localhost:8000'
9+
10+
# Enable all below if you are using vector database: Pinecone.
11+
VECTOR_DB="pinecone"
12+
PINECONE_ENVIRONMENT=
13+
PINECONE_API_KEY=
14+
PINECONE_INDEX=
15+
16+
# Enable all below if you are using vector database: LanceDB.
17+
# VECTOR_DB="lancedb"
18+
19+
# CLOUD DEPLOYMENT VARIABLES ONLY
20+
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
21+
# JWT_SECRET="my-random-string-for-seeding" # Only needed if AUTH_TOKEN is set. Please generate a random string at least 12 chars long.
22+
STORAGE_DIR="./server/storage"
23+
GOOGLE_APIS_KEY=
24+
UID='1000'
25+
GID='1000'

0 commit comments

Comments
 (0)