From 86c1b56be11905df15d535c7c203c8ce08073950 Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Tue, 16 Apr 2024 19:46:48 +0800 Subject: [PATCH 1/9] modify requirements --- ppstructure/recovery/requirements.txt | 5 ++++- requirements.txt | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 761b9d7c3e..714f5aced2 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,4 +2,7 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -pdf2docx \ No newline at end of file +pdf2docx +lxml +premailer +openpyxl \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d4c95b889f..3b430170c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,9 +10,5 @@ rapidfuzz opencv-python<=4.6.0.66 opencv-contrib-python<=4.6.0.66 cython -lxml -premailer -openpyxl -attrdict Pillow>=10.0.0 pyyaml \ No newline at end of file From 5eb940a04b054f8f8ed50ccc2aa07b8c04dafcab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 16 Apr 2024 20:02:51 +0800 Subject: [PATCH 2/9] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3b430170c7..3af26908ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ opencv-python<=4.6.0.66 opencv-contrib-python<=4.6.0.66 cython Pillow>=10.0.0 -pyyaml \ No newline at end of file +pyyaml From 55daba74188845aa3b01c19cddc50868227063ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 16 Apr 2024 20:03:00 +0800 Subject: [PATCH 3/9] Update requirements.txt --- ppstructure/recovery/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppstructure/recovery/requirements.txt 
b/ppstructure/recovery/requirements.txt index 714f5aced2..10d4ae4d66 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -5,4 +5,4 @@ fire>=0.3.0 pdf2docx lxml premailer -openpyxl \ No newline at end of file +openpyxl From 858df6184519b45be9d202eda2f0436906b96196 Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Thu, 25 Apr 2024 20:30:45 +0800 Subject: [PATCH 4/9] try import pdfconvert --- paddleocr.py | 2 ++ ppstructure/pdf2word/pdf2word.py | 4 +++- ppstructure/predict_system.py | 2 ++ ppstructure/recovery/requirements.txt | 1 - 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index d03a6932e2..dcac47802e 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -19,6 +19,7 @@ __dir__ = os.path.dirname(__file__) import paddle +from paddle.utils import try_import sys.path.append(os.path.join(__dir__, "")) @@ -910,6 +911,7 @@ def main(): img = cv2.imread(img_path) if args.recovery and args.use_pdf2docx_api and flag_pdf: + try_import("pdf2docx") from pdf2docx.converter import Converter docx_file = os.path.join(args.output, "{}.docx".format(img_name)) diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py index 06ae555598..c9e61eee8f 100644 --- a/ppstructure/pdf2word/pdf2word.py +++ b/ppstructure/pdf2word/pdf2word.py @@ -25,7 +25,6 @@ fitz = try_import("fitz") from PIL import Image -from pdf2docx.converter import Converter from qtpy.QtWidgets import ( QApplication, QWidget, @@ -209,6 +208,9 @@ def run(self): break # using use_pdf2docx_api for PDF parsing if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf": + try_import("pdf2docx") + from pdf2docx.converter import Converter + self.totalPageCnt += 1 self.progressBarRange.emit(self.totalPageCnt) print("===============using use_pdf2docx_api===============") diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 9073e87ee1..6148cc8901 100644 --- 
a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -28,6 +28,7 @@ import logging from copy import deepcopy +from paddle.utils import try_import from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger from ppocr.utils.visual import draw_ser_results, draw_re_results @@ -300,6 +301,7 @@ def main(args): img_name = os.path.basename(image_file).split(".")[0] if args.recovery and args.use_pdf2docx_api and flag_pdf: + try_import("pdf2docx") from pdf2docx.converter import Converter os.makedirs(args.output, exist_ok=True) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 10d4ae4d66..433f359e44 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,7 +2,6 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -pdf2docx lxml premailer openpyxl From 11cdbdb4bcdd5f6fe303a79eb911ae6ed7680dc3 Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Thu, 25 Apr 2024 20:39:00 +0800 Subject: [PATCH 5/9] try import lxml --- ppstructure/table/table_metric/table_metric.py | 6 +++++- ppstructure/table/tablepyxl/tablepyxl.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py index d5ba6a0afb..0ec11f578d 100755 --- a/ppstructure/table/table_metric/table_metric.py +++ b/ppstructure/table/table_metric/table_metric.py @@ -12,10 +12,11 @@ from rapidfuzz.distance import Levenshtein from apted import APTED, Config from apted.helpers import Tree -from lxml import etree, html + from collections import deque from .parallel import parallel_process from tqdm import tqdm +from paddle.utils import try_import class TableTree(Tree): @@ -161,6 +162,9 @@ def evaluate(self, pred, true): """Computes TEDS score between the prediction and the ground truth of a given sample """ + try_import("lxml") + from lxml 
import etree, html + if (not pred) or (not true): return 0.0 parser = html.HTMLParser(remove_comments=True, encoding="utf-8") diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py index 95b75d1ac7..3124ab66d4 100644 --- a/ppstructure/table/tablepyxl/tablepyxl.py +++ b/ppstructure/table/tablepyxl/tablepyxl.py @@ -1,11 +1,11 @@ # Do imports like python3 so our package works for 2 and 3 from __future__ import absolute_import -from lxml import html from openpyxl import Workbook from openpyxl.utils import get_column_letter from premailer import Premailer from tablepyxl.style import Table +from paddle.utils import try_import def string_to_int(s): @@ -15,6 +15,9 @@ def string_to_int(s): def get_Tables(doc): + try_import("lxml") + from lxml import etree, html + tree = html.fromstring(doc) comments = tree.xpath("//comment()") for comment in comments: From 9ab3e71fbf7294fcb4b95fb3a16af3cd78db14da Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Thu, 25 Apr 2024 20:41:06 +0800 Subject: [PATCH 6/9] try import lxml --- ppstructure/recovery/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 433f359e44..1003086628 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,6 +2,5 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -lxml premailer openpyxl From 63d9c78aede1f2abfcf1194ac6d1cc484124fe2e Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Thu, 25 Apr 2024 20:44:03 +0800 Subject: [PATCH 7/9] try import premailer --- ppstructure/recovery/requirements.txt | 1 - ppstructure/table/tablepyxl/tablepyxl.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 1003086628..742ff3fde0 100644 --- a/ppstructure/recovery/requirements.txt +++ 
b/ppstructure/recovery/requirements.txt @@ -2,5 +2,4 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -premailer openpyxl diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py index 3124ab66d4..ac631ee333 100644 --- a/ppstructure/table/tablepyxl/tablepyxl.py +++ b/ppstructure/table/tablepyxl/tablepyxl.py @@ -3,7 +3,7 @@ from openpyxl import Workbook from openpyxl.utils import get_column_letter -from premailer import Premailer + from tablepyxl.style import Table from paddle.utils import try_import @@ -90,6 +90,9 @@ def document_to_workbook(doc, wb=None, base_url=None): every table in the document. The workbook is returned """ + try_import("premailer") + from premailer import Premailer + if not wb: wb = Workbook() wb.remove(wb.active) From d4839b1709b00430764ea404957d0b03af6ba127 Mon Sep 17 00:00:00 2001 From: Liyulingyue <852433440@qq.com> Date: Thu, 25 Apr 2024 21:06:27 +0800 Subject: [PATCH 8/9] try import openpyxl --- ppstructure/recovery/requirements.txt | 1 - ppstructure/table/tablepyxl/style.py | 33 ++++++++++++++---------- ppstructure/table/tablepyxl/tablepyxl.py | 6 +++-- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 742ff3fde0..bad600d728 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,4 +2,3 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -openpyxl diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py index 4787e7d377..dfd0f2478a 100644 --- a/ppstructure/table/tablepyxl/style.py +++ b/ppstructure/table/tablepyxl/style.py @@ -1,19 +1,26 @@ # This is where we handle translating css styles into openpyxl styles # and cascading those from parent to child in the dom. 
-from openpyxl.cell import cell -from openpyxl.styles import ( - Font, - Alignment, - PatternFill, - NamedStyle, - Border, - Side, - Color, -) -from openpyxl.styles.fills import FILL_SOLID -from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE -from openpyxl.styles.colors import BLACK +try: + from openpyxl.cell import cell + from openpyxl.styles import ( + Font, + Alignment, + PatternFill, + NamedStyle, + Border, + Side, + Color, + ) + from openpyxl.styles.fills import FILL_SOLID + from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE + from openpyxl.styles.colors import BLACK +except: + import warnings + + warnings.warn( + "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure." + ) FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy" diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py index ac631ee333..92f6536052 100644 --- a/ppstructure/table/tablepyxl/tablepyxl.py +++ b/ppstructure/table/tablepyxl/tablepyxl.py @@ -1,8 +1,6 @@ # Do imports like python3 so our package works for 2 and 3 from __future__ import absolute_import -from openpyxl import Workbook -from openpyxl.utils import get_column_letter from tablepyxl.style import Table from paddle.utils import try_import @@ -30,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1): Writes every tr child element of elem to a row in the worksheet returns the next row after all rows are written """ + try_import("openpyxl") from openpyxl.cell.cell import MergedCell + from openpyxl.utils import get_column_letter initial_column = column for table_row in elem.rows: @@ -91,7 +91,9 @@ def document_to_workbook(doc, wb=None, base_url=None): The workbook is returned """ try_import("premailer") + try_import("openpyxl") from premailer import Premailer + from openpyxl import Workbook if not wb: wb = Workbook() From 3eee6bb8423e846b5953720ad988b6b9a3c196af Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 25 Apr 2024 21:08:55 +0800 Subject: [PATCH 9/9] Apply suggestions from code review --- ppstructure/table/table_metric/table_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py index 0ec11f578d..327b87adbf 100755 --- a/ppstructure/table/table_metric/table_metric.py +++ b/ppstructure/table/table_metric/table_metric.py @@ -12,7 +12,6 @@ from rapidfuzz.distance import Levenshtein from apted import APTED, Config from apted.helpers import Tree - from collections import deque from .parallel import parallel_process from tqdm import tqdm