For anyone hoping to deploy this as a lambda #56

Open
dgonier opened this issue Apr 22, 2024 · 2 comments
dgonier commented Apr 22, 2024

Dockerfile

```dockerfile
# syntax=docker/dockerfile:experimental

FROM python:3.11-bookworm
RUN apt-get update && apt-get -y --no-install-recommends install libgomp1
ENV APP_HOME /app

# install Java
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y openjdk-17-jre-headless

# install essential packages
RUN apt-get install -y \
    libxml2-dev libxslt-dev \
    build-essential libmagic-dev

# install tesseract
RUN apt-get install -y \
    tesseract-ocr \
    lsb-release \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
    && apt-get update \
    && apt-get install -y \
    tesseract-ocr libtesseract-dev \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata

RUN apt-get install unzip -y && \
    apt-get install git -y && \
    apt-get autoremove -y

WORKDIR ${APP_HOME}
COPY ./requirements.txt ./requirements.txt
RUN pip install --upgrade pip setuptools
RUN apt-get install -y libmagic1
RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
RUN pip install -r requirements.txt

# Set NLTK data directory environment variable to ensure it uses a known location
RUN mkdir -p /usr/local/share/nltk_data && chmod a+rwx /usr/local/share/nltk_data
ENV NLTK_DATA /usr/local/share/nltk_data

# Download necessary NLTK data using the defined base directory
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt

RUN pip install awslambdaric

COPY . ./

ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]

# Set up the command for the Lambda handler
CMD [ "handler.parse" ]
```

handler.py

```python
import base64
import json
import tempfile
import os
import traceback
import subprocess
import time
import threading

import requests
from werkzeug.utils import secure_filename
from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils


def parse_document(file_content, filename, render_format="all", use_new_indent_parser=False, apply_ocr=False):
    parse_options = {
        "parse_and_render_only": True,
        "render_format": render_format,
        "use_new_indent_parser": use_new_indent_parser,
        "parse_pages": (),
        "apply_ocr": apply_ocr
    }

    tmp_file_path = None
    try:
        # Create a temporary file to save the decoded content
        tempfile_handler, tmp_file_path = tempfile.mkstemp(suffix=os.path.splitext(filename)[1])
        with os.fdopen(tempfile_handler, 'wb') as tmp_file:
            tmp_file.write(file_content)

        # calculate the file properties
        props = file_utils.extract_file_properties(tmp_file_path)
        print(f"Parsing document: {filename}")
        return_dict, _ = ingestor_api.ingest_document(
            filename,
            tmp_file_path,
            props["mimeType"],
            parse_options=parse_options,
        )
        return return_dict or {}

    except Exception as e:
        traceback.print_exc()
        return {"status": "fail", "reason": str(e)}

    finally:
        if tmp_file_path and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)


def read_output(process):
    while True:
        output = process.stdout.readline()
        if output == '':
            break
        print(output.strip())


def start_tika():
    print('see jar', os.path.exists("jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"))
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    java_path = "/usr/bin/java"  # Use the common path for Java
    process = subprocess.Popen([java_path, "-jar", tika_path],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    # thread = threading.Thread(target=read_output, args=(process,))
    # thread.start()

    # Main thread can perform other tasks here, or wait for the output thread to finish
    # thread.join()
    print("Tika Server process started.")


# Call start_tika() early in your Lambda handler.

def test_tika():
    try:
        response = requests.get('http://localhost:9998/tika')
        if response.status_code == 200:
            print("Tika Server is reachable and ready!")
            return True
        else:
            print("Tika Server is not ready. Status Code:", response.status_code)
            return False
    except Exception as e:
        print("Failed to connect to Tika Server:", str(e))
        return False


def parse(event, context):
    print(context)
    if 'body' not in event:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": "No data provided"})
        }
    start_tika()

    # Poll until the Tika server is ready to accept requests
    working = test_tika()
    while not working:
        time.sleep(3)
        working = test_tika()

    # Decode the file from base64
    file_content = base64.b64decode(event['body'])
    filename = "uploaded_document.pdf"  # This needs to be passed or inferred some way

    # Extract additional parameters
    params = event.get('queryStringParameters', {})
    render_format = params.get('render_format', 'all')
    use_new_indent_parser = params.get('use_new_indent_parser', 'no') == 'yes'
    apply_ocr = params.get('apply_ocr', 'no') == 'yes'

    # Process the document
    result = parse_document(
        file_content, filename, render_format, use_new_indent_parser, apply_ocr
    )

    return {
        "statusCode": 200,
        "return_dict": result
    }
```
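Once deployed, the function can also be invoked directly with boto3 by building the same event shape by hand. A minimal sketch, assuming boto3 credentials are configured; the function name and `sample.pdf` are placeholders for your own deployment:

```python
import base64
import json
import boto3

FUNCTION_NAME = "nlm-ingestor-parser"  # hypothetical name; replace with your deployed function


def invoke_parser(pdf_path):
    with open(pdf_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")

    # Same event shape handler.parse expects: base64 body plus optional query params
    event = {
        "body": encoded,
        "queryStringParameters": {
            "render_format": "all",
            "use_new_indent_parser": "no",
            "apply_ocr": "no",
        },
    }

    client = boto3.client("lambda")
    response = client.invoke(
        FunctionName=FUNCTION_NAME,
        Payload=json.dumps(event).encode("utf-8"),
    )
    return json.loads(response["Payload"].read())


if __name__ == "__main__":
    print(invoke_parser("sample.pdf"))
```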
@jpbalarini

Hi @dgonier, do you mind pasting your code again? It seems the formatting has been messed up. Thanks!

@wei-ann-Github

Hello, thank you for this Dockerfile.
I would like to ask about this line,

> RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts

Why does the container need to populate known_hosts? Does it communicate with or fetch anything from those hosts? And why github.com specifically?
