-
Notifications
You must be signed in to change notification settings - Fork 136
/
Copy pathcontents.py
81 lines (68 loc) · 2.95 KB
/
contents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import time
import PyPDF2
import docx
import readability
from langdetect import detect
from newspaper import fulltext, Article
from selenium import webdriver
def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Fetch a web page and return its article text as non-empty lines.

    The rendered HTML and language code come from ``_get_raw_html``. The text
    is first extracted with newspaper's ``fulltext``; if that raises, the
    article is downloaded and parsed again through ``newspaper.Article``.

    Args:
        url: Address of the page to crawl.

    Returns:
        A ``(contents, lang)`` tuple: ``contents`` is the list of stripped,
        non-empty text lines and ``lang`` is the language code reported by
        ``_get_raw_html``.
    """
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, while any extraction
        # failure falls back to newspaper's own download-and-parse path.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    # Keep only lines that still have content after trimming whitespace.
    stripped_lines = (line.strip() for line in text.splitlines())
    contents = [line for line in stripped_lines if line]
    return contents, lang
def _get_raw_html(url: str, wait_seconds: float = 5) -> tuple[str, str]:
    """Load *url* in headless Chrome and return its readable HTML and language.

    Args:
        url: Address of the page to load.
        wait_seconds: How long to sleep after ``driver.get`` before reading
            the page source, giving the page time to finish loading.
            Defaults to 5, the previously hard-coded value.

    Returns:
        A ``(html, lang)`` tuple: ``html`` is the summary produced by
        ``readability.Document`` and ``lang`` is the first two characters of
        the code returned by ``langdetect.detect`` for that summary.
    """
    chrome_options = webdriver.ChromeOptions()
    browser_flags = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    )
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(url)
        print(f"Please wait for {wait_seconds} seconds until the webpage finishes loading.")
        time.sleep(wait_seconds)
        page_source = driver.page_source

    # Reduce the full page to its main readable content before detection.
    html = readability.Document(page_source).summary()
    lang = detect(html)
    # NOTE(review): presumably truncates region-qualified codes to the bare
    # two-letter language code -- confirm against langdetect's outputs.
    return html, lang[0:2]
# Characters that, when they end a line, close the paragraph being assembled.
_PDF_LINE_TERMINATORS = (
    '.', '!', '?', '。', '…', ';', ':', '”', '’', ')', '】', '》', '」',
    '』', '〕', '〉', '〗', '〞', '〟', '»', '"', "'", ']', '}',
    # Fullwidth forms of ! ? ; : ) -- the original list repeated the ASCII
    # characters where these CJK variants appear to have been intended.
    '\uff01', '\uff1f', '\uff1b', '\uff1a', '\uff09',
)


def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a PDF file.

    Each page's text is split into stripped, non-empty lines. Consecutive
    lines are concatenated (with no separator) until one ends in a
    terminating punctuation mark, at which point the accumulated text is
    emitted as one entry. Text left over at the end of a page is emitted as
    its own entry, so entries never span pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(contents, lang)`` tuple: ``contents`` is the list of assembled
        text entries and ``lang`` is the first two characters of the code
        returned by ``langdetect.detect`` for the joined entries.
    """
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        contents = []
        for page in pdf_reader.pages:
            page_lines = [
                line.strip()
                for line in page.extract_text().splitlines()
                if line.strip()
            ]
            pending = []  # fragments of the paragraph currently being built
            for line in page_lines:
                pending.append(line)
                if line.endswith(_PDF_LINE_TERMINATORS):
                    contents.append(''.join(pending))
                    pending = []
            if pending:
                # Flush an unterminated paragraph at the page boundary.
                contents.append(''.join(pending))
    # NOTE(review): langdetect presumably raises when no text was extracted
    # (e.g. image-only PDFs) -- confirm how callers should handle that.
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a TXT file.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(contents, lang)`` tuple: ``contents`` is the list of stripped,
        non-empty lines and ``lang`` is the first two characters of the code
        returned by ``langdetect.detect`` for the joined lines.
    """
    # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise stay glued to the first line
    # (str.strip() does not treat U+FEFF as whitespace).
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        contents = [line for line in stripped_lines if line]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Collect the non-empty paragraph texts of a DOCX file.

    Args:
        file_path: Path of the DOCX document to open.

    Returns:
        A ``(contents, lang)`` tuple: ``contents`` holds each paragraph's
        stripped text, skipping paragraphs that are empty after stripping,
        and ``lang`` is the first two characters of the code returned by
        ``langdetect.detect`` for the joined paragraphs.
    """
    paragraphs = docx.Document(file_path).paragraphs
    contents = []
    for paragraph in paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            contents.append(stripped)
    detected = detect('\n'.join(contents))
    return contents, detected[0:2]