from pathlib import Path

import nbformat
from nbconvert import MarkdownExporter

from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.classes.page import Page


class NotebookConverter(BaseConverter):
    """Convert a Jupyter notebook (``.ipynb``) into a Markdown file using nbconvert."""

    def __init__(self):
        super().__init__()

    # Override
    def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
        """Render the notebook at ``input_path`` as Markdown.

        Args:
            input_path: Path of the notebook to read.
            output_path: Desired output path; its suffix is replaced by ``.md``.

        Returns:
            The path of the Markdown file that was written.
        """
        output_path = output_path.with_suffix(".md")

        # Notebooks are JSON documents; read them as UTF-8 explicitly instead of
        # relying on the platform's locale encoding.
        with open(input_path, "r", encoding="utf-8") as input_file:
            notebook = nbformat.read(input_file, as_version=4)

        markdown_content, _resources = MarkdownExporter().from_notebook_node(notebook)
        processed_content = self._post_process_markdown(markdown_content)

        # Open the output only after the conversion succeeded, so a failing
        # notebook does not leave an empty/truncated Markdown file behind.
        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(processed_content)
        return output_path

    @staticmethod
    def _fence_marker(line: str) -> str:
        """Return the run of backticks/tildes (length >= 3) that starts ``line``.

        Returns an empty string when the line is not a fenced-code delimiter.
        Lines indented by more than three spaces are never treated as fences.
        """
        unindented = line.lstrip(" ")
        if len(line) - len(unindented) > 3:
            return ""
        for fence_char in ("`", "~"):
            run_length = len(unindented) - len(unindented.lstrip(fence_char))
            if run_length >= 3:
                return fence_char * run_length
        return ""

    def _post_process_markdown(self, markdown_content: str) -> str:
        """Normalise the heading structure of the exported Markdown.

        The first line is dropped (per the original author: it is the title of
        the course section), the line at index 1 of the remainder is turned
        into the single H1 (the lecture title), and every other heading is
        pushed down one level. Lines inside fenced code blocks are copied
        verbatim so that code indentation and ``#`` comments are preserved.

        Args:
            markdown_content: Markdown text produced by the exporter.

        Returns:
            The post-processed Markdown text.
        """
        lines = markdown_content.split("\n")[1:]  # first line is the title of the course section

        processed_lines = []
        open_fence = ""  # marker of the currently open fenced block, "" when outside
        for i, line in enumerate(lines):
            marker = self._fence_marker(line)

            if open_fence:
                # Inside a code block: never strip or re-level anything.
                processed_lines.append(line)
                # A closing fence uses the same character, is at least as long
                # as the opening one, and carries no other text.
                if marker.startswith(open_fence) and not line.strip().strip(marker[0]):
                    open_fence = ""
                continue

            if marker:
                open_fence = marker
                processed_lines.append(line)
                continue

            if i == 1:  # convert lecture title to h1
                processed_lines.append(f"# {line.lstrip('#').strip()}")
            elif line.startswith("#"):  # convert all other heading down one level
                processed_lines.append(f"#{line.strip()}")
            else:
                processed_lines.append(line.strip())

        return "\n".join(processed_lines)
import re
from pathlib import Path

from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.classes.page import Page


class PythonConverter(BaseConverter):
    """Convert a Python source file into Markdown by wrapping it in a code fence."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def _fence_for(content: str) -> str:
        """Return a backtick fence that cannot be closed by anything in ``content``.

        The fence is at least three backticks long and always one backtick
        longer than the longest backtick run found in ``content``.
        """
        longest_run = max((len(run) for run in re.findall(r"`+", content)), default=0)
        return "`" * max(3, longest_run + 1)

    # Override
    def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
        """Converts a Python file to a Markdown file by formatting it as a code block.

        Args:
            input_path: Path of the Python source file to read.
            output_path: Desired output path; its suffix is replaced by ``.md``.

        Returns:
            The path of the Markdown file that was written.
        """
        output_path = output_path.with_suffix(".md")

        # Read the source first (explicitly as UTF-8, Python's default source
        # encoding) so that a read failure does not leave an empty output file.
        with open(input_path, "r", encoding="utf-8") as input_file:
            content = input_file.read()

        # A source file may itself contain ``` (e.g. in a docstring); a longer
        # fence keeps such a line from terminating the block early. For files
        # without backtick runs of three or more this is the plain ``` fence.
        fence = self._fence_for(content)
        markdown_content = f"{fence}python\n{content}\n{fence}"

        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(markdown_content)

        return output_path
logging.info(f"Completed processing for directory: {input_dir}") - logging.info(f"Saved conversion time [{ConversionCache.calc_total_savings()} seconds] by using cached results.") + logging.info(f"Saved conversion time [{ConversionCache.calc_total_savings()} seconds] by using cached results.") \ No newline at end of file diff --git a/rag/requirements-base.txt b/rag/requirements-base.txt index ce3f29b..2eb4086 100644 --- a/rag/requirements-base.txt +++ b/rag/requirements-base.txt @@ -304,4 +304,4 @@ Werkzeug==2.2.3 xxhash==3.5.0 yacs==0.1.8 yapf==0.43.0 -yarl==1.18.3 +yarl==1.18.3 \ No newline at end of file