# extract_features.py
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
from datetime import datetime
# Header row of the output CSV, in the order the features are emitted.
FEATURE_COLUMNS = ['author', 'channel', 'img_num', 'video_num', 'link_num',
                   'paragraph_num', 'words_num', 'words/paragraph', 'tweet_num',
                   'topic_num', 'month', 'week_day', 'till_today', 'hour',
                   'if number in title']

# Placeholder written when a time-derived feature cannot be computed.
MISSING = 'N'

# Captures the last path segment of the first link in <head>.
AUTHOR_HREF_RE = re.compile(r'/\S*/([^(]*)/"')

# Layout of the date part sliced out of the <time datetime="..."> attribute.
DATE_FORMAT = "%d %b %Y"

# Fixed date the 'till_today' feature is measured against (01 Jan 2015).
REFERENCE_DATE = datetime(2015, 1, 1)


def words_per_paragraph(words_count, paragraph_count):
    """Return the whole number of words per paragraph.

    Returns 0 when there are no paragraphs, instead of raising
    ZeroDivisionError as a plain division would.
    """
    if paragraph_count == 0:
        return 0
    return words_count // paragraph_count


def time_features(stamp):
    """Derive [month, week_day, till_today, hour] from a datetime string.

    The features are taken by fixed character offsets, so *stamp* is
    assumed to look like 'Wed, 19 Jun 2013 15:04:30 +0000' -- TODO confirm
    against the real data.  When *stamp* is None every feature is the
    MISSING placeholder; when only the date part fails to parse, just
    'till_today' is MISSING.
    """
    if stamp is None:
        return [MISSING, MISSING, MISSING, MISSING]
    try:
        # NOTE: %b month names are locale-dependent.
        published = datetime.strptime(stamp[5:16], DATE_FORMAT)
        days = (REFERENCE_DATE - published).days
    except ValueError:
        days = MISSING
    # The hour is prefixed with 'x' so it is written as text, not a number.
    return [stamp[8:11], stamp[0:3], days, 'x' + stamp[17:19]]


def author_slug(soup):
    """Return the author identifier of a parsed document.

    Prefers the slug captured from the first <head> link; if that yields
    nothing, falls back to the nested <span> text in <head> with its first
    three characters dropped, lower-cased and with spaces turned into
    hyphens.
    """
    matches = AUTHOR_HREF_RE.findall(str(soup.head.a))
    author = matches[0] if matches else ''
    if not author:
        # NOTE(review): raises if the nested spans or their string are
        # absent -- same as before; presumably the text starts with "By ".
        author = soup.head.div.span.span.string[3:].lower()
        author = author.replace(' ', '-')
    return author


def extract_features(html):
    """Parse one HTML document and return its feature row as a list.

    The values are ordered to match FEATURE_COLUMNS.
    """
    soup = BeautifulSoup(html, "html.parser")

    paragraph_count = len(soup.find_all('p'))
    words_count = len(soup.get_text().split())

    time_tag = soup.time
    stamp = time_tag.get('datetime') if time_tag is not None else None

    features = [
        author_slug(soup),
        soup.body.article['data-channel'],
        len(soup.find_all('img')),
        # <iframe> elements are counted as videos.
        len(soup.find_all('iframe')),
        len(soup.find_all('a')),
        paragraph_count,
        words_count,
        words_per_paragraph(words_count, paragraph_count),
        # <blockquote> elements are counted as tweets.
        len(soup.find_all('blockquote')),
        # Links inside the footer are counted as topics.
        len(soup.body.footer.find_all('a')),
    ]
    features.extend(time_features(stamp))
    # 1 if the title contains any digit, else 0.
    features.append(int(any(char.isdigit() for char in soup.body.h1.get_text())))
    return features


def main(input_path='test.csv', output_path='test_feature03.csv'):
    """Read the input CSV and write one feature row per input row.

    The HTML of each document is read from the second column (index 1) of
    *input_path*.  Every row is processed, so no row count is hard-coded
    (the original note listed train: 27643, test: 11847).  The output file
    is opened once and receives the header followed by the feature rows.
    """
    df = pd.read_csv(input_path)
    with open(output_path, 'w', encoding='UTF-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(FEATURE_COLUMNS)
        for html in df.iloc[:, 1]:
            writer.writerow(extract_features(html))


if __name__ == '__main__':
    main()