Skip to content

Commit 4f8a964

Browse files
committed
❇️ Improve the detection around some cases
Close #365 #357 #356
1 parent 165211a commit 4f8a964

File tree

4 files changed: +8 -7 lines changed

bin/coverage.py

+3 -1
--- a/bin/coverage.py
+++ b/bin/coverage.py
@@ -5,7 +5,7 @@
 from typing import List
 import argparse

-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
 from charset_normalizer.utils import iana_name

 from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
         print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
         exit(1)

+    print(f"> using charset-normalizer {__version__}")
+
     success_count = 0
     total_count = 0

charset_normalizer/md.py

+3 -4
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover

     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count <= 32:
             return 0.0

         ratio_of_suspicious_range_usage: float = (
             self._suspicious_successive_range_count * 2
         ) / self._character_count

-        if ratio_of_suspicious_range_usage < 0.1:
-            return 0.0
-
         return ratio_of_suspicious_range_usage


@@ -521,6 +518,8 @@ def is_suspiciously_successive_range(
         return False
     if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
         return False
+    if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+        return False

     return True

charset_normalizer/utils.py

+1 -1
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
     if character_range is None:
         return False

-    return "Forms" in character_range
+    return "Forms" in character_range and character_category != "Lo"


 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

charset_normalizer/version.py

+1 -1
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "3.3.0"
+__version__ = "3.3.1"
 VERSION = __version__.split(".")

0 commit comments

Comments
 (0)