Why does the container need to know the list of known_hosts? Does it communicate with or fetch any information from those hosts? And why the github.com host in particular?
Dockerfile
# syntax=docker/dockerfile:experimental
FROM python:3.11-bookworm
RUN apt-get update && apt-get -y --no-install-recommends install libgomp1
ENV APP_HOME /app
# install Java
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y openjdk-17-jre-headless
# install essential packages
RUN apt-get install -y \
    libxml2-dev libxslt-dev \
    build-essential libmagic-dev
# install tesseract
RUN apt-get install -y \
    tesseract-ocr \
    lsb-release \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
    && apt-get update \
    && apt-get install -y \
    tesseract-ocr libtesseract-dev \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
RUN apt-get install unzip -y && \
    apt-get install git -y && \
    apt-get autoremove -y
WORKDIR ${APP_HOME}
COPY ./requirements.txt ./requirements.txt
RUN pip install --upgrade pip setuptools
RUN apt-get install -y libmagic1
RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
RUN pip install -r requirements.txt
# Set NLTK data directory environment variable to ensure it uses a known location
RUN mkdir -p /usr/local/share/nltk_data && chmod a+rwx /usr/local/share/nltk_data
ENV NLTK_DATA /usr/local/share/nltk_data
# Download necessary NLTK data using the defined base directory
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt
RUN pip install awslambdaric
COPY . ./
ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
# Set up the command for the Lambda handler
CMD [ "handler.parse" ]
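If the NLTK downloads end up somewhere the library does not look, the ingestor's tokenizer lookups fail at runtime, so a quick in-container check can save a deploy cycle. A minimal sketch, to be run inside the built image (the script name is hypothetical; nltk.data.find raises LookupError when a resource is missing):

# check_nltk.py -- hypothetical sanity check, run inside the built image
import os
import nltk

print("NLTK_DATA =", os.environ.get("NLTK_DATA"))
# These raise LookupError if the downloads did not land under NLTK_DATA
print(nltk.data.find("corpora/stopwords"))
print(nltk.data.find("tokenizers/punkt"))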
handler.py
import base64
import json
import tempfile
import os
import traceback
import subprocess
import time
import threading
import requests
from werkzeug.utils import secure_filename
from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils
def parse_document(file_content, filename, render_format="all", use_new_indent_parser=False, apply_ocr=False):
    parse_options = {
        "parse_and_render_only": True,
        "render_format": render_format,
        "use_new_indent_parser": use_new_indent_parser,
        "parse_pages": (),
        "apply_ocr": apply_ocr,
    }
    tmp_file_path = None
    try:
        # Create a temporary file to save the decoded content
        tempfile_handler, tmp_file_path = tempfile.mkstemp(suffix=os.path.splitext(filename)[1])
        with os.fdopen(tempfile_handler, 'wb') as tmp_file:
            tmp_file.write(file_content)
        # Calculate the file properties (mime type etc.)
        props = file_utils.extract_file_properties(tmp_file_path)
        print(f"Parsing document: {filename}")
        return_dict, _ = ingestor_api.ingest_document(
            filename,
            tmp_file_path,
            props["mimeType"],
            parse_options=parse_options,
        )
        return return_dict or {}
    except Exception as e:
        traceback.print_exc()
        return {"status": "fail", "reason": str(e)}
    finally:
        if tmp_file_path and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
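For quick iteration it can help to exercise parse_document outside Lambda entirely. A minimal sketch of a separate test script, assuming it runs from the app root (so the relative jar path in start_tika resolves) and that sample.pdf is a placeholder file you supply:

# local_test.py -- hypothetical quick test, separate from handler.py
import time
from handler import parse_document, start_tika, test_tika

start_tika()                 # the ingestor relies on the Tika server being up
while not test_tika():
    time.sleep(3)

with open("sample.pdf", "rb") as f:   # sample.pdf is a placeholder document
    content = f.read()

output = parse_document(content, "sample.pdf", render_format="all")
print("top-level keys:", list(output.keys()))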
def read_output(process):
    # Stream the Tika server's stdout line by line so its logs are visible
    while True:
        output = process.stdout.readline()
        if output == '':
            break
        print(output.strip())

def start_tika():
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    print('see jar', os.path.exists(tika_path))
    java_path = "/usr/bin/java"  # Use the common path for Java
    process = subprocess.Popen([java_path, "-jar", tika_path],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    # thread = threading.Thread(target=read_output, args=(process,))
    # thread.start()
    # Main thread can perform other tasks here, or wait for the output thread to finish
    # thread.join()
    print("Tika Server process started.")
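If you do want the Tika output to show up in the Lambda logs, the commented-out thread above can be revived as a daemon so it never blocks the handler. A small sketch of that variant (start_tika_with_logs is not part of the original code):

def start_tika_with_logs():
    # Hypothetical variant of start_tika: stream Tika's output on a daemon thread
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    process = subprocess.Popen(["/usr/bin/java", "-jar", tika_path],
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    log_thread = threading.Thread(target=read_output, args=(process,), daemon=True)
    log_thread.start()  # daemon thread dies with the interpreter, so it never blocks shutdown
    return process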
# Call start_tika() early in your Lambda handler so the server has time to boot.
def test_tika():
    try:
        response = requests.get('http://localhost:9998/tika')
        if response.status_code == 200:
            print("Tika Server is reachable and ready!")
            return True
        else:
            print("Tika Server is not ready. Status Code:", response.status_code)
            return False
    except Exception as e:
        print("Failed to connect to Tika Server:", str(e))
        return False
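The handler below polls test_tika() every three seconds with no upper bound, which can consume the entire Lambda timeout if the jar never comes up. A hedged alternative (wait_for_tika is not part of the original code) would cap the wait and let the handler return an error instead:

def wait_for_tika(timeout_seconds=60, poll_interval=3):
    # Hypothetical helper: poll test_tika() until it succeeds or the deadline passes
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if test_tika():
            return True
        time.sleep(poll_interval)
    return False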
def parse(event, context):
    print(context)
    if 'body' not in event:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": "No data provided"})
        }
    # Start the Tika server and wait until it responds
    start_tika()
    working = test_tika()
    while not working:
        time.sleep(3)
        working = test_tika()
    # Decode the file from base64
    file_content = base64.b64decode(event['body'])
    filename = "uploaded_document.pdf"  # This needs to be passed or inferred somehow
    # Extract additional parameters
    params = event.get('queryStringParameters') or {}
    render_format = params.get('render_format', 'all')
    use_new_indent_parser = params.get('use_new_indent_parser', 'no') == 'yes'
    apply_ocr = params.get('apply_ocr', 'no') == 'yes'
    # Process the document
    result = parse_document(
        file_content, filename, render_format, use_new_indent_parser, apply_ocr
    )
    return {
        "statusCode": 200,
        "return_dict": result
    }
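For an end-to-end check before deploying, the handler can be invoked locally over HTTP. A sketch, assuming the image is run behind the AWS Lambda Runtime Interface Emulator (not included in the Dockerfile above) with the emulator's port 8080 published on localhost:9000, and with sample.pdf as a placeholder document:

# invoke_local.py -- hypothetical local invocation through the Runtime Interface Emulator
import base64
import json
import requests

RIE_URL = "http://localhost:9000/2015-03-31/functions/function/invocations"

with open("sample.pdf", "rb") as f:      # sample.pdf is a placeholder document
    body = base64.b64encode(f.read()).decode("utf-8")

event = {
    "body": body,
    "queryStringParameters": {"render_format": "all", "apply_ocr": "no"},
}
response = requests.post(RIE_URL, data=json.dumps(event), timeout=300)
print(response.json())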