nlmatics
diff --git a/‎Dockerfile
Lines changed: 17 additions & 5 deletions b/‎Dockerfile
Lines changed: 17 additions & 5 deletions
diff --git a/‎README.md
Lines changed: 22 additions & 13 deletions b/‎README.md
Lines changed: 22 additions & 13 deletions
diff --git a/‎nlm_ingestor/file_parser/tika_parser.py
Lines changed: 1 addition & 1 deletion b/‎nlm_ingestor/file_parser/tika_parser.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎nlm_ingestor/ingestion_daemon/__main__.py
Lines changed: 7 additions & 5 deletions b/‎nlm_ingestor/ingestion_daemon/__main__.py
Lines changed: 7 additions & 5 deletions
diff --git a/‎nlm_ingestor/ingestor/ingestor_api.py
Lines changed: 1 addition & 0 deletions b/‎nlm_ingestor/ingestor/ingestor_api.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎nlm_ingestor/ingestor/pdf_ingestor.py
Lines changed: 0 additions & 3 deletions b/‎nlm_ingestor/ingestor/pdf_ingestor.py
Lines changed: 0 additions & 3 deletions
@@ -5,10 +5,22 @@ ENV APP_HOME /app
 # install Java
 RUN mkdir -p /usr/share/man/man1 && \
     apt-get update -y && \
-    apt-get install -y openjdk-17-jre-headless && \
-    apt-get install -y libxml2-dev && \
-    apt-get install -y libxslt-dev && \
-    apt-get install -y build-essential
+    apt-get install -y openjdk-17-jre-headless
+# install essential packages
+RUN apt-get install -y \
+    libxml2-dev libxslt-dev \
+    build-essential libmagic-dev
+# install tesseract
+RUN apt-get install -y \
+    tesseract-ocr \
+    lsb-release \
+    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
+    && apt-get update -oAcquire::AllowInsecureRepositories=true \
+    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
+    && apt-get update \
+    && apt-get install -y \
+    tesseract-ocr libtesseract-dev \
+    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
 RUN apt-get install unzip -y && \
     apt-get install git -y && \
     apt-get autoremove -y
@@ -21,4 +33,4 @@ RUN pip install -r requirements.txt
 RUN python -m nltk.downloader stopwords
 RUN python -m nltk.downloader punkt
 RUN chmod +x run.sh
-CMD ./run.sh
+CMD ./run.sh
@@ -8,15 +8,16 @@ The PDF parser works off text layer and also offers a OCR option (apply_ocr) to
 Check out the notebook [pdf_visual_ingestor_step_by_step](notebooks/pdf_visual_ingestor_step_by_step.ipynb) to experiment directly with the PDF parser.
 
 The PDF Parser offers the following features:
-    1. Sections and subsections along with their levels.
-    2. Paragraphs - combines lines.
-    3. Links between sections and paragraphs.
-    5. Tables along with the section the tables are found in.
-    6. Lists and nested lists.
-    7. Join content spread across pages.
-    8. Removal of repeating headers and footers.
-    9. Watermark removal.
-    10. OCR with boundary boxes
+
+1. Sections and subsections along with their levels.
+2. Paragraphs - combines lines.
+3. Links between sections and paragraphs.
+5. Tables along with the section the tables are found in.
+6. Lists and nested lists.
+7. Join content spread across pages.
+8. Removal of repeating headers and footers.
+9. Watermark removal.
+10. OCR with bounding boxes
 
 ### HTML
 A special HTML parser that creates layout aware blocks to make RAG performance better with higher quality chunks. 
@@ -47,14 +48,22 @@ In some cases, your PDFs may result in errors in the Java server and you will ne
 python -m nlm_ingestor.ingestion_daemon
 ```
 ### Run the docker file
-A docker image is available via github container registry. Before running the following code, you may need to authenticate with docker first
-cat ~/TOKEN.txt | docker login https://ghcr.io -u USERNAME --password-stdin
-where TOKEN.txt is the token you create as described here: https://docs.github.com/en/[email protected]/packages/working-with-a-github-packages-registry/working-with-the-docker-registry
+A Docker image is available via the public GitHub Container Registry.
 
+Pull the docker image
 ```
 docker pull ghcr.io/nlmatics/nlm-ingestor:latest
-docker run nlm-ingestor-<version>
 ```
+Run the Docker image, mapping port 5001 to a port of your choice.
+```
+docker run -p 5010:5001 ghcr.io/nlmatics/nlm-ingestor:latest-<version>
+```
+Once you have the server running, your llmsherpa url will be:
+"http://localhost:5010/api/parseDocument?renderFormat=all"
+- to apply OCR add &applyOcr=yes
+- to use the new indent parser, which uses a different algorithm to assign header levels, add &useNewIndentParser=yes
+- this server is good for your development - in production it is recommended to run this behind a secure gateway using nginx or cloud gateways
+
 ### Test the ingestor server
 Sample test code to test the server with llmsherpa parser is in this [notebook](notebooks/test_llmsherpa_api.ipynb).
 
 
@@ -14,7 +14,7 @@ def __init__(self):
 
     def parse_to_html(self, filepath, do_ocr=False):
         # Turn off OCR by default
-        timeout = 9000
+        timeout = 3000
         headers = {
             "X-Tika-OCRskipOcr": "true"
         }
 
@@ -20,15 +20,17 @@ def parse_document(
     render_format: str = "all",
 ):
     render_format = request.args.get('renderFormat', 'all')
-    use_new_indent_parser = request.args.get('useNewIndentParser', 'all')
+    use_new_indent_parser = request.args.get('useNewIndentParser', 'no')
+    apply_ocr = request.args.get('applyOcr', 'no')
     file = request.files['file']
     tmp_file = None
     try:
         parse_options = {
             "parse_and_render_only": True,
             "render_format": render_format,
-            "use_new_indent_parser": use_new_indent_parser,
-            "parse_pages": ()
+            "use_new_indent_parser": use_new_indent_parser == "yes",
+            "parse_pages": (),
+            "apply_ocr": apply_ocr == "yes"
         }
         # save the incoming file to a temporary location
         filename = secure_filename(file.filename)
@@ -52,6 +54,7 @@ def parse_document(
         )
 
     except Exception as e:
+        print("error uploading file, stacktrace: ", traceback.format_exc())
         logger.error(
             f"error uploading file, stacktrace: {traceback.format_exc()}",
             exc_info=True,
@@ -65,8 +68,7 @@ def parse_document(
 
 def main():
     logger.info("Starting ingestor service..")
-    app.run(host="0.0.0.0", port=5001, debug=False)
-
+    app.run(host="0.0.0.0", port=5001, debug=True)
 
 if __name__ == "__main__":
     main()
 
@@ -33,6 +33,7 @@ def ingest_document(
         logger.info(f"Parsing {mime_type} at {doc_location} with name {doc_name}")
         if mime_type == "application/pdf":
             logger.info("using pdf parser")
+            print("testing..", parse_options)
             pdfi = pdf_ingestor.PDFIngestor(doc_location, parse_options)
             return_dict = pdfi.return_dict
         elif mime_type in {"text/markdown", "text/x-markdown"}:
 
@@ -7,12 +7,9 @@
 
 from bs4 import BeautifulSoup
 
-from . import table_builder
 from nlm_ingestor.file_parser import pdf_file_parser
 from timeit import default_timer
-from .visual_ingestor import table_parser
 from .visual_ingestor import visual_ingestor
-from nlm_ingestor.ingestor.visual_ingestor import block_renderer
 from nlm_ingestor.ingestor.visual_ingestor.new_indent_parser import NewIndentParser
 from nlm_ingestor.ingestor_utils.utils import NpEncoder, \
     detect_block_center_aligned, detect_block_center_of_page
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ def __init__(self):`
`14`	`14`
`15`	`15`	`def parse_to_html(self, filepath, do_ocr=False):`
`16`	`16`	`# Turn off OCR by default`
`17`		`- timeout = 9000`
	`17`	`+ timeout = 3000`
`18`	`18`	`headers = {`
`19`	`19`	`"X-Tika-OCRskipOcr": "true"`
`20`	`20`	`}`