-
Notifications
You must be signed in to change notification settings - Fork 2
/
ingestor.py
35 lines (26 loc) · 936 Bytes
/
ingestor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import PyPDF2
from pymongo import MongoClient
# Create a client for the MongoDB server at localhost:27017, then bind the
# "bot_db" database and its "bot_data" collection to module-level names.
client = MongoClient("mongodb://localhost:27017")
db = client.get_database("bot_db")
collection = db.get_collection("bot_data")
# Function to insert data into MongoDB
def insert_into_mongodb(data):
    """Insert a single document into the module-level ``collection``.

    Args:
        data: The document to store; it is handed unchanged to
            ``collection.insert_one``.

    Returns:
        None. The value returned by ``insert_one`` is discarded.
    """
    collection.insert_one(data)
# Read the CSV file and store every "content" cell as its own MongoDB document.
csv_data = pd.read_csv("sample.csv")
# Only the "content" column is used, so walk that column directly instead of
# calling iterrows(), which builds a full Series for every row and may change
# value dtypes along the way.
for content in csv_data["content"]:
    insert_into_mongodb({"content": content})
# Read PDF file and insert into MongoDB
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        The result of ``extract_text()`` for each page, concatenated in page
        order with no separator between pages.
    """
    with open(pdf_file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Join inside the ``with`` block so extraction runs while the file is
        # still open; a single join replaces repeated ``+=`` on a string.
        return "".join(page.extract_text() for page in pdf_reader.pages)
# Extract the text of sample.pdf and store it as one document.
pdf_text = extract_text_from_pdf("sample.pdf")
data = dict(content=pdf_text)
insert_into_mongodb(data)