Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

created notebook and python converter #118

Merged
merged 4 commits into from
Mar 27, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions rag/file_conversion_router/conversion/notebook_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pathlib import Path
import nbformat
from nbconvert import MarkdownExporter

from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.classes.page import Page


class NotebookConverter(BaseConverter):
    """Convert a Jupyter notebook to Markdown using nbconvert's ``MarkdownExporter``."""

    def __init__(self):
        super().__init__()

    # Override
    def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
        """Render the notebook at ``input_path`` as a Markdown file.

        Args:
            input_path: Path of the notebook to read (parsed as nbformat v4).
            output_path: Desired output path; its suffix is replaced by ``.md``.

        Returns:
            The path of the Markdown file that was written.
        """
        output_path = output_path.with_suffix(".md")

        # Parse and convert BEFORE opening the output file, so a malformed
        # notebook does not leave an empty/truncated .md file behind.
        # Notebooks are JSON documents stored as UTF-8, so do not rely on the
        # platform's default encoding.
        with open(input_path, "r", encoding="utf-8") as input_file:
            notebook = nbformat.read(input_file, as_version=4)

        markdown_content, _resources = MarkdownExporter().from_notebook_node(notebook)

        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(self._post_process_markdown(markdown_content))
        return output_path

    def _post_process_markdown(self, markdown_content: str) -> str:
        """Normalize the heading structure of the exported Markdown.

        The first line is dropped (it is the title of the course section).
        Of the remaining lines, the one at index 1 becomes the single ``#``
        heading (the lecture title), every other heading is pushed down one
        level, and surrounding whitespace is stripped from ordinary lines.

        Lines inside fenced code blocks (and the fence delimiters themselves)
        are passed through unchanged: stripping them would destroy code
        indentation, and a Python comment such as ``# note`` is not a heading.

        Args:
            markdown_content: Markdown text produced by the exporter.

        Returns:
            The post-processed Markdown text.
        """
        lines = markdown_content.split("\n")[1:]  # first line is the title of the course section

        processed_lines = []
        open_fence = ""  # marker that opened the current fenced block; "" when outside one
        for i, line in enumerate(lines):
            marker = self._fence_marker(line)

            if open_fence:
                processed_lines.append(line)
                # A closing fence uses the same character, is at least as long
                # as the opening fence, and carries no other text.
                if marker.startswith(open_fence) and line.strip() == marker:
                    open_fence = ""
            elif marker:
                open_fence = marker
                processed_lines.append(line)
            elif i == 1:  # convert lecture title to h1
                processed_lines.append(f"# {line.lstrip('#').strip()}")
            elif line.startswith("#"):  # convert all other heading down one level
                processed_lines.append(f"#{line.strip()}")
            else:
                processed_lines.append(line.strip())

        return "\n".join(processed_lines)

    @staticmethod
    def _fence_marker(line: str) -> str:
        """Return the run of 3+ backticks or tildes that starts ``line``, or ``""`` if none."""
        stripped = line.lstrip()
        for fence_char in ("`", "~"):
            run_length = len(stripped) - len(stripped.lstrip(fence_char))
            if run_length >= 3:
                return fence_char * run_length
        return ""
20 changes: 20 additions & 0 deletions rag/file_conversion_router/conversion/python_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pathlib import Path
from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.classes.page import Page

class PythonConverter(BaseConverter):
    """Convert a Python source file to Markdown by wrapping it in a fenced code block."""

    def __init__(self):
        super().__init__()

    # Override
    def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
        """Converts a Python file to a Markdown file by formatting it as a code block.

        Args:
            input_path: Path of the Python source file to read.
            output_path: Desired output path; its suffix is replaced by ``.md``.

        Returns:
            The path of the Markdown file that was written.
        """
        output_path = output_path.with_suffix(".md")

        # Read the source first so that a read/decode failure does not leave an
        # empty .md file behind. UTF-8 is Python's default source encoding, so
        # use it explicitly instead of the platform default.
        with open(input_path, "r", encoding="utf-8") as input_file:
            content = input_file.read()

        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(self._wrap_in_code_fence(content))

        return output_path

    @staticmethod
    def _wrap_in_code_fence(content: str) -> str:
        """Return ``content`` wrapped in a ``python`` fenced code block.

        The fence starts at three backticks and grows until it no longer occurs
        anywhere in ``content``, so source that itself contains backtick fences
        (for example inside a docstring) cannot terminate the block early.
        """
        fence = "```"
        while fence in content:
            fence += "`"
        return f"{fence}python\n{content}\n{fence}"
8 changes: 6 additions & 2 deletions rag/file_conversion_router/services/directory_service.py
Original file line number Diff line number Diff line change
@@ -13,6 +13,8 @@
from rag.file_conversion_router.conversion.video_converter import VideoConverter
from rag.file_conversion_router.conversion.ed_converter import EdConverter
from rag.file_conversion_router.conversion.html_converter import HtmlConverter
from rag.file_conversion_router.conversion.notebook_converter import NotebookConverter
from rag.file_conversion_router.conversion.python_converter import PythonConverter
from rag.file_conversion_router.services.task_manager import schedule_conversion
from rag.file_conversion_router.utils.logger import content_logger, set_log_file_path
from rag.file_conversion_router.utils.conversion_cache import ConversionCache
@@ -27,7 +29,9 @@
".rst": RstConverter,
".mp4": VideoConverter,
".json": EdConverter,
".html": HtmlConverter
".html": HtmlConverter,
".ipynb": NotebookConverter,
".py": PythonConverter
# TODO: Add more file types and converters here
}

@@ -99,4 +103,4 @@ def process_folder(input_dir: Union[str, Path], output_dir: Union[str, Path],

content_logger.info(f"Completed content checking for directory: {input_dir}")
logging.info(f"Completed processing for directory: {input_dir}")
logging.info(f"Saved conversion time [{ConversionCache.calc_total_savings()} seconds] by using cached results.")
logging.info(f"Saved conversion time [{ConversionCache.calc_total_savings()} seconds] by using cached results.")
2 changes: 1 addition & 1 deletion rag/requirements-base.txt
Original file line number Diff line number Diff line change
@@ -304,4 +304,4 @@ Werkzeug==2.2.3
xxhash==3.5.0
yacs==0.1.8
yapf==0.43.0
yarl==1.18.3
yarl==1.18.3