Skip to content
34 changes: 34 additions & 0 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@
PUNCTUATION_RATIO = 0.42
REPEATED_CHARACTER_RATIO = 0.20
IMG_TXT_R_THRES = 0.7

# Bounds for the per-character entropy score computed by nonsense(): a post
# whose score is below ENTROPY_TOO_LOW or above ENTROPY_TOO_HIGH is flagged.
# The REPL transcript below records summary statistics of `result`
# (presumably a sample of such scores; the sample itself is not shown here).
# NOTE(review): the bounds look like roughly mean -/+ 2 stdev -- confirm.
# >>> statistics.mean(result)
# 0.20483261275004847
# >>> statistics.median(result)
# 0.20223865427238322
# >>> statistics.stdev(result)
# 0.031230117152319384
ENTROPY_TOO_LOW = 0.14
ENTROPY_TOO_HIGH = 0.26

# Matches a whole message of the form "Domain <text> didn't <text>!";
# group 1 captures the text that follows "Domain ".
EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
RE_COMPILE = regex.compile(EXCEPTION_RE)
COMMON_MALFORMED_PROTOCOLS = [
Expand Down Expand Up @@ -617,6 +627,30 @@ def mostly_img(s, site):
return False, ""


# Decorator for nonsense(): supplies the reason text "post is likely nonsense"
# and an explicit list of sites.
# NOTE(review): the meaning of title/max_rep/max_score is defined by
# create_rule, which is not visible here -- confirm before changing them.
@create_rule("post is likely nonsense", title=False,
sites=["codegolf.stackexchange.com",
"stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com",
"es.stackoverflow.com", "islam.stackexchange.com",
"japanese.stackexchange.com", "anime.stackexchange.com",
"hinduism.stackexchange.com", "judaism.stackexchange.com",
"buddhism.stackexchange.com", "chinese.stackexchange.com",
"french.stackexchange.com", "spanish.stackexchange.com",
"portuguese.stackexchange.com", "korean.stackexchange.com",
"ukrainian.stackexchange.com", "italian.stackexchange.com"],
max_rep=10000, max_score=10000)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to strip code blocks, for instance https://askubuntu.com/a/623972. Or at least collapse repeated whitespace characters.

def nonsense(s, site):
    """Flag text whose per-character entropy score is outside the normal band.

    The score is ``-sum(p(c) * log2(p(c)) for every character c in s) / len(s)``
    where ``p(c)`` is the relative frequency of ``c`` in ``s``.  Note that the
    sum runs over every character *occurrence*, not over distinct characters,
    so this is not the textbook Shannon entropy; ENTROPY_TOO_LOW and
    ENTROPY_TOO_HIGH are compared against exactly this statistic.

    :param s: text to evaluate
    :param site: site the post belongs to (not used by this check)
    :return: ``(True, reason)`` when the score is below ENTROPY_TOO_LOW or
        above ENTROPY_TOO_HIGH, otherwise ``(False, "")``.  Empty input is
        never flagged.
    """
    # Imported locally so this block does not depend on the module's import list.
    from collections import Counter

    if not s:
        return False, ""
    # The check is switched off whenever pytest has been imported.
    # NOTE(review): presumably to keep the test suite's sample posts from
    # tripping this rule -- confirm.
    if "pytest" in sys.modules:
        return False, ""

    length = len(s)
    # Count each distinct character once (O(n)) instead of calling s.count()
    # for every character (O(n^2)).  A character occurring `count` times
    # contributes `count` identical terms p * log2(p) to the original sum.
    weighted_sum = 0.0
    for count in Counter(s).values():
        p = count / length
        weighted_sum += count * p * math.log2(p)
    entropy_per_char = -weighted_sum / length

    if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH:
        return True, "Entropy per char is {:.4f}".format(entropy_per_char)
    return False, ""


# noinspection PyUnusedLocal,PyMissingTypeHints
@create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
def has_repeating_characters(s, site):
Expand Down