This repository was archived by the owner on Dec 2, 2022. It is now read-only.

Commit 3a31dc8

add more generic image reading (#28)
* add more generic image reading
* update readme
1 parent 02d500d commit 3a31dc8

7 files changed, +114 -55 lines changed


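To make the change concrete, here is a minimal usage sketch (the sample file path and the constructor options are illustrative, following the Craft class docstring and the tests touched below): detect_text now accepts a file path, raw encoded bytes, or a numpy array through its image argument instead of only an image_path string.

import cv2

from craft_text_detector import Craft

# constructor options shown are illustrative; only output_dir matters for the export step
craft = Craft(output_dir="output/", crop_type="poly", cuda=False)

# 1) file path (previous behaviour, now passed via the image argument)
result = craft.detect_text(image="figures/idcard.png")

# 2) numpy array, e.g. a frame already decoded with OpenCV
frame = cv2.imread("figures/idcard.png")
result = craft.detect_text(image=frame)

# 3) raw encoded bytes, decoded internally by the new read_image helper
with open("figures/idcard.png", "rb") as f:
    result = craft.detect_text(image=f.read())

print(len(result["boxes"]), result["times"])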
README.md

-2

@@ -89,7 +89,6 @@ prediction_result = get_prediction(
 
 # export detected text regions
 exported_file_paths = export_detected_regions(
-    image_path=image_path,
     image=image,
     regions=prediction_result["boxes"],
     output_dir=output_dir,
@@ -98,7 +97,6 @@ exported_file_paths = export_detected_regions(
 
 # export heatmap, detection points, box visualization
 export_extra_results(
-    image_path=image_path,
     image=image,
     regions=prediction_result["boxes"],
     heatmaps=prediction_result["heatmaps"],

craft_text_detector/__init__.py

+25 -15

@@ -1,12 +1,14 @@
 from __future__ import absolute_import
 
+import os
+
 import craft_text_detector.craft_utils as craft_utils
 import craft_text_detector.file_utils as file_utils
 import craft_text_detector.image_utils as image_utils
 import craft_text_detector.predict as predict
 import craft_text_detector.torch_utils as torch_utils
 
-__version__ = "0.3.3"
+__version__ = "0.3.4"
 
 
 __all__ = [
@@ -45,7 +47,6 @@ def __init__(
     ):
         """
        Arguments:
-            image_path: path to the image to be processed
             output_dir: path to the results to be exported
             rectify: rectify detected polygon by affine transform
             export_extra: export heatmap, detection points, box visualization
@@ -102,21 +103,26 @@ def unload_refinenet_model(self):
         self.refine_net = None
         empty_cuda_cache()
 
-    def detect_text(self, image_path):
+    def detect_text(self, image, image_path=None):
         """
         Arguments:
-            image_path: path to the image to be processed
+            image: path to the image to be processed or numpy array or PIL image
+
         Output:
-            {"masks": lists of predicted masks 2d as bool array,
-            "boxes": list of coords of points of predicted boxes,
-            "boxes_as_ratios": list of coords of points of predicted boxes as ratios of image size,
-            "polys_as_ratios": list of coords of points of predicted polys as ratios of image size,
-            "heatmaps": visualization of the detected characters/links,
-            "text_crop_paths": list of paths of the exported text boxes/polys,
-            "times": elapsed times of the sub modules, in seconds}
+            {
+                "masks": lists of predicted masks 2d as bool array,
+                "boxes": list of coords of points of predicted boxes,
+                "boxes_as_ratios": list of coords of points of predicted boxes as ratios of image size,
+                "polys_as_ratios": list of coords of points of predicted polys as ratios of image size,
+                "heatmaps": visualization of the detected characters/links,
+                "text_crop_paths": list of paths of the exported text boxes/polys,
+                "times": elapsed times of the sub modules, in seconds
+            }
         """
-        # load image
-        image = read_image(image_path)
+
+        if image_path is not None:
+            print("Argument 'image_path' is deprecated, use 'image' instead.")
+            image = image_path
 
         # perform prediction
         prediction_result = get_prediction(
@@ -142,10 +148,14 @@ def detect_text(self, image_path):
         prediction_result["text_crop_paths"] = []
         if self.output_dir is not None:
             # export detected text regions
+            if type(image) == str:
+                file_name, file_ext = os.path.splitext(os.path.basename(image))
+            else:
+                file_name = "image"
             exported_file_paths = export_detected_regions(
-                image_path=image_path,
                 image=image,
                 regions=regions,
+                file_name=file_name,
                 output_dir=self.output_dir,
                 rectify=self.rectify,
             )
@@ -154,10 +164,10 @@ def detect_text(self, image_path):
             # export heatmap, detection points, box visualization
             if self.export_extra:
                 export_extra_results(
-                    image_path=image_path,
                     image=image,
                     regions=regions,
                     heatmaps=prediction_result["heatmaps"],
+                    file_name=file_name,
                     output_dir=self.output_dir,
                 )

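A brief, hypothetical sketch of the file naming behaviour introduced here (paths and constructor options are illustrative): when image is a path string, exported crops go into a folder derived from the file name; for array or bytes input the generic stem "image" is used.

import cv2

from craft_text_detector import Craft

craft = Craft(output_dir="output/", export_extra=False, crop_type="box", cuda=False)

# path input -> crops written under output/idcard_crops/
craft.detect_text(image="figures/idcard.png")

# array input -> crops written under output/image_crops/
craft.detect_text(image=cv2.imread("figures/idcard.png"))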
craft_text_detector/file_utils.py

+23 -18

@@ -5,6 +5,8 @@
 import gdown
 import numpy as np
 
+from craft_text_detector.image_utils import read_image
+
 
 def download(url: str, save_path: str):
     """
@@ -158,22 +160,27 @@ def export_detected_region(image, poly, file_path, rectify=True):
 
 
 def export_detected_regions(
-    image_path, image, regions, output_dir: str = "output/", rectify: bool = False
+    image,
+    regions,
+    file_name: str = "image",
+    output_dir: str = "output/",
+    rectify: bool = False,
 ):
     """
     Arguments:
-        image_path: path to original image
-        image: full/original image
+        image: path to the image to be processed or numpy array or PIL image
         regions: list of bboxes or polys
+        file_name (str): export image file name
         output_dir: folder to be exported
         rectify: rectify detected polygon by affine transform
     """
+
+    # read/convert image
+    image = read_image(image)
+
     # deepcopy image so that original is not altered
     image = copy.deepcopy(image)
 
-    # get file name
-    file_name, file_ext = os.path.splitext(os.path.basename(image_path))
-
     # create crops dir
     crops_dir = os.path.join(output_dir, file_name + "_crops")
     create_dir(crops_dir)
@@ -194,34 +201,32 @@ def export_detected_regions(
 
 
 def export_extra_results(
-    image_path,
     image,
     regions,
     heatmaps,
+    file_name: str = "image",
     output_dir="output/",
     verticals=None,
     texts=None,
 ):
-    """ save text detection result one by one
+    """save text detection result one by one
     Args:
-        image_path (str): image file name
-        image (array): raw image context
+        image: path to the image to be processed or numpy array or PIL image
+        file_name (str): export image file name
         boxes (array): array of result file
             Shape: [num_detections, 4] for BB output / [num_detections, 4]
            for QUAD output
     Return:
        None
    """
-    image = np.array(image)
-
-    # make result file list
-    filename, file_ext = os.path.splitext(os.path.basename(image_path))
+    # read/convert image
+    image = read_image(image)
 
     # result directory
-    res_file = os.path.join(output_dir, filename + "_text_detection.txt")
-    res_img_file = os.path.join(output_dir, filename + "_text_detection.png")
-    text_heatmap_file = os.path.join(output_dir, filename + "_text_score_heatmap.png")
-    link_heatmap_file = os.path.join(output_dir, filename + "_link_score_heatmap.png")
+    res_file = os.path.join(output_dir, file_name + "_text_detection.txt")
+    res_img_file = os.path.join(output_dir, file_name + "_text_detection.png")
+    text_heatmap_file = os.path.join(output_dir, file_name + "_text_score_heatmap.png")
+    link_heatmap_file = os.path.join(output_dir, file_name + "_link_score_heatmap.png")
 
     # create output dir
     create_dir(output_dir)

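As a rough end-to-end sketch of the updated helpers (threshold values, file names and paths are illustrative; get_prediction is called as in tests/test_helpers.py), the new file_name argument replaces the naming that was previously derived from image_path:

from craft_text_detector import (
    export_detected_regions,
    export_extra_results,
    get_prediction,
    load_craftnet_model,
)

# run detection on a sample image (a path, bytes or a numpy array all work now)
craft_net = load_craftnet_model(cuda=False)
prediction_result = get_prediction(
    image="figures/idcard.png",
    craft_net=craft_net,
    refine_net=None,
    text_threshold=0.7,
    link_threshold=0.4,
    low_text=0.4,
    cuda=False,
    long_size=720,
)

# crops are written under <output_dir>/<file_name>_crops/
export_detected_regions(
    image="figures/idcard.png",
    regions=prediction_result["boxes"],
    file_name="idcard",
    output_dir="output/",
    rectify=True,
)

# heatmaps and box visualizations reuse the same file_name stem
export_extra_results(
    image="figures/idcard.png",
    regions=prediction_result["boxes"],
    heatmaps=prediction_result["heatmaps"],
    file_name="idcard",
    output_dir="output/",
)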
craft_text_detector/image_utils.py

+18 -8

@@ -7,14 +7,24 @@
 import numpy as np
 
 
-def read_image(img_file):
-    img = cv2.imread(img_file)
-    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-    # following two cases are not explained in the original repo
-    if img.shape[0] == 2:
-        img = img[0]
-    if img.shape[2] == 4:
-        img = img[:, :, :3]
+def read_image(image):
+    if type(image) == str:
+        img = cv2.imread(image)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+    elif type(image) == bytes:
+        nparr = np.frombuffer(image, np.uint8)
+        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+    elif type(image) == np.ndarray:
+        if len(image.shape) == 2:  # grayscale
+            img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+        elif len(image.shape) == 3 and image.shape[2] == 3:  # BGRscale
+            img = image
+        elif len(image.shape) == 3 and image.shape[2] == 4:  # RGBAscale
+            img = image[:, :, :3]
+            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
 
     return img

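A quick sketch of the input types the rewritten read_image handles (the sample path is a placeholder). Note that the docstrings elsewhere in this commit also mention PIL images, while the branches above cover str, bytes and numpy arrays:

import cv2
import numpy as np

from craft_text_detector import read_image

# file path on disk
img_from_path = read_image("figures/idcard.png")

# raw encoded bytes, e.g. received over the network
with open("figures/idcard.png", "rb") as f:
    img_from_bytes = read_image(f.read())

# already-decoded numpy array (grayscale, 3-channel and 4-channel are handled)
img_from_array = read_image(cv2.imread("figures/idcard.png"))

assert isinstance(img_from_path, np.ndarray)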
craft_text_detector/predict.py

+4 -1

@@ -22,7 +22,7 @@ def get_prediction(
 ):
     """
     Arguments:
-        image: image to be processed
+        image: path to the image to be processed or numpy array or PIL image
         output_dir: path to the results to be exported
         craft_net: craft net model
         refine_net: refine net model
@@ -43,6 +43,9 @@
     """
     t0 = time.time()
 
+    # read/convert image
+    image = image_utils.read_image(image)
+
     # resize
     img_resized, target_ratio, size_heatmap = image_utils.resize_aspect_ratio(
         image, long_size, interpolation=cv2.INTER_LINEAR

tests/test_craft.py

+5 -4

@@ -1,4 +1,5 @@
 import unittest
+
 from craft_text_detector import Craft
 
 
@@ -75,7 +76,7 @@ def test_detect_text(self):
             crop_type="poly",
         )
         # detect text
-        prediction_result = craft.detect_text(image_path=self.image_path)
+        prediction_result = craft.detect_text(image=self.image_path)
 
         self.assertEqual(len(prediction_result["boxes"]), 52)
         self.assertEqual(len(prediction_result["boxes"][0]), 4)
@@ -96,7 +97,7 @@ def test_detect_text(self):
             crop_type="poly",
         )
         # detect text
-        prediction_result = craft.detect_text(image_path=self.image_path)
+        prediction_result = craft.detect_text(image=self.image_path)
 
         self.assertEqual(len(prediction_result["boxes"]), 19)
         self.assertEqual(len(prediction_result["boxes"][0]), 4)
@@ -117,7 +118,7 @@ def test_detect_text(self):
             crop_type="box",
         )
         # detect text
-        prediction_result = craft.detect_text(image_path=self.image_path)
+        prediction_result = craft.detect_text(image=self.image_path)
 
         self.assertEqual(len(prediction_result["boxes"]), 52)
         self.assertEqual(len(prediction_result["boxes"][0]), 4)
@@ -138,7 +139,7 @@ def test_detect_text(self):
             crop_type="box",
         )
         # detect text
-        prediction_result = craft.detect_text(image_path=self.image_path)
+        prediction_result = craft.detect_text(image=self.image_path)
 
         self.assertEqual(len(prediction_result["boxes"]), 19)
         self.assertEqual(len(prediction_result["boxes"][0]), 4)

tests/test_helpers.py

+39 -7

@@ -3,12 +3,12 @@
 from tempfile import TemporaryDirectory
 
 from craft_text_detector import (
-    read_image,
-    load_craftnet_model,
-    load_refinenet_model,
-    get_prediction,
     export_detected_regions,
     export_extra_results,
+    get_prediction,
+    load_craftnet_model,
+    load_refinenet_model,
+    read_image,
 )
 
 
@@ -20,18 +20,17 @@ def test_load_craftnet_model(self):
         self.assertTrue(craft_net)
 
         with TemporaryDirectory() as dir_name:
-            weight_path = Path(dir_name, 'weights.pth')
+            weight_path = Path(dir_name, "weights.pth")
             self.assertFalse(weight_path.is_file())
             load_craftnet_model(cuda=False, weight_path=weight_path)
             self.assertTrue(weight_path.is_file())
 
-
     def test_load_refinenet_model(self):
         refine_net = load_refinenet_model(cuda=False)
         self.assertTrue(refine_net)
 
         with TemporaryDirectory() as dir_name:
-            weight_path = Path(dir_name, 'weights.pth')
+            weight_path = Path(dir_name, "weights.pth")
             self.assertFalse(weight_path.is_file())
             load_refinenet_model(cuda=False, weight_path=weight_path)
             self.assertTrue(weight_path.is_file())
@@ -73,6 +72,39 @@ def test_get_prediction(self):
             prediction_result["heatmaps"]["text_score_heatmap"].shape, (240, 368, 3)
         )
 
+    def test_get_prediction_without_read_image(self):
+        # set image filepath
+        image = self.image_path
+
+        # load models
+        craft_net = load_craftnet_model()
+        refine_net = None
+
+        # perform prediction
+        text_threshold = 0.9
+        link_threshold = 0.2
+        low_text = 0.2
+        cuda = False
+        prediction_result = get_prediction(
+            image=image,
+            craft_net=craft_net,
+            refine_net=refine_net,
+            text_threshold=text_threshold,
+            link_threshold=link_threshold,
+            low_text=low_text,
+            cuda=cuda,
+            long_size=720,
+        )
+
+        self.assertEqual(len(prediction_result["boxes"]), 35)
+        self.assertEqual(len(prediction_result["boxes"][0]), 4)
+        self.assertEqual(len(prediction_result["boxes"][0][0]), 2)
+        self.assertEqual(int(prediction_result["boxes"][0][0][0]), 111)
+        self.assertEqual(len(prediction_result["polys"]), 35)
+        self.assertEqual(
+            prediction_result["heatmaps"]["text_score_heatmap"].shape, (240, 368, 3)
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
