Skip to content

Commit 66966f1

Browse files
authored
❇️ Improve the detection around some cases (#366)
Close #365 #357 #356
1 parent 49653a6 commit 66966f1

File tree

6 files changed

+24
-11
lines changed

6 files changed

+24
-11
lines changed

CHANGELOG.md

+7-1
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,12 @@
22
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
44

5+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-??)
6+
7+
### Changed
8+
- Optional mypyc compilation upgraded to version 1.6.0 for Python >= 3.8
9+
- Improved the general detection reliability based on reports from the community
10+
511
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
612

713
### Added
@@ -14,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
1420

1521
### Changed
1622
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
17-
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7
23+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
1824

1925
### Fixed
2026
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)

bin/coverage.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from typing import List
66
import argparse
77

8-
from charset_normalizer import from_path
8+
from charset_normalizer import from_path, __version__
99
from charset_normalizer.utils import iana_name
1010

1111
from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
4040
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
4141
exit(1)
4242

43+
print(f"> using charset-normalizer {__version__}")
44+
4345
success_count = 0
4446
total_count = 0
4547

charset_normalizer/md.py

+8-5
Original file line number | Diff line number | Diff line change
@@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover
233233

234234
@property
235235
def ratio(self) -> float:
236-
if self._character_count == 0:
236+
if self._character_count <= 24:
237237
return 0.0
238238

239239
ratio_of_suspicious_range_usage: float = (
240240
self._suspicious_successive_range_count * 2
241241
) / self._character_count
242242

243-
if ratio_of_suspicious_range_usage < 0.1:
244-
return 0.0
245-
246243
return ratio_of_suspicious_range_usage
247244

248245

@@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
295292
self._is_current_word_bad = True
296293
# Word/Buffer ending with an upper case accentuated letter are so rare,
297294
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
298-
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
295+
if (
296+
is_accentuated(self._buffer[-1])
297+
and self._buffer[-1].isupper()
298+
and all(_.isupper() for _ in self._buffer) is False
299+
):
299300
self._foreign_long_count += 1
300301
self._is_current_word_bad = True
301302
if buffer_length >= 24 and self._foreign_long_watch:
@@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
521522
return False
522523
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
523524
return False
525+
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
526+
return False
524527

525528
return True
526529

charset_normalizer/utils.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
9696
if character_range is None:
9797
return False
9898

99-
return "Forms" in character_range
99+
return "Forms" in character_range and character_category != "Lo"
100100

101101

102102
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
106106
if character_range is None:
107107
return False
108108

109-
return "Emoticons" in character_range
109+
return "Emoticons" in character_range or "Pictographs" in character_range
110110

111111

112112
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

charset_normalizer/version.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,5 @@
22
Expose version
33
"""
44

5-
__version__ = "3.3.0"
5+
__version__ = "3.3.1"
66
VERSION = __version__.split(".")

tests/test_edge_case.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
from charset_normalizer import from_bytes
2+
import pytest
3+
import platform
24

3-
5+
@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
46
def test_unicode_edge_case():
57
payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'
68

0 commit comments

Comments
 (0)