-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
146 lines (135 loc) · 4.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import webapp2
from webapp2_extras import jinja2
import urllib
from google.appengine.api import urlfetch
from datetime import date, timedelta
from HTMLParser import HTMLParser
import json
import logging
# Credentials sent to the remote services used by the helpers in this module.
# NOTE(review): both secrets are hard-coded in source; consider loading them
# from configuration instead of committing them.
HP_KEY = '08002bd7-6107-41a6-8aef-d067dc823d70'  # 'apikey' for api.idolondemand.com
PARSER_TOKEN = '841f04fafb770c310d4aa9876a10a155ea922817'  # 'token' for readability.com

# Date 30 days before today. Evaluated once, when the module is imported.
initial_date = date.today() - timedelta(days=30)
def get_relative_news(text):
    """
    Returns a list of news related to the text

    Queries the ``querytextindex`` endpoint of api.idolondemand.com against
    the ``news_eng`` index, asking for documents dated within the last 30
    days and for a ``context`` summary.

    :param text: search text; it is encoded to UTF-8 before being sent
        (presumably a unicode string -- confirm against callers).
    :returns: the value stored under ``documents`` in the decoded JSON reply.
    :raises KeyError: if the decoded reply has no ``documents`` key.
    """
    # Compute the lower date bound on every call. The module-level
    # ``initial_date`` is evaluated only once at import time, so on a
    # long-lived instance it would drift further into the past each day and
    # the "last 30 days" window would silently widen.
    min_date = date.today() - timedelta(days=30)
    params = {
        'indexes': 'news_eng',
        'text': text.encode('utf8'),
        'min_date': min_date.strftime('%Y-%m-%d'),
        'apikey': HP_KEY,
        'summary': 'context'
    }
    form_data = urllib.urlencode(params)
    resp = urlfetch.fetch(
        url="https://api.idolondemand.com/1/api/sync/querytextindex/v1?%s" % form_data,
        method=urlfetch.GET,
        deadline=60
    )
    # From here on ``resp`` is the decoded JSON body, not the fetch result.
    resp = json.loads(resp.content)
    logging.info("### get_relative_news ###")
    logging.info(resp)
    return resp['documents']
def parser_main_content(url):
    """
    Extract the main content of a given URL

    Sends ``url`` together with ``PARSER_TOKEN`` as query parameters to the
    readability.com content parser endpoint and returns the decoded JSON
    reply as-is.
    """
    query = urllib.urlencode({
        'url': url,
        'token': PARSER_TOKEN,
    })
    endpoint = "https://readability.com/api/content/v1/parser?%s" % query
    reply = urlfetch.fetch(url=endpoint, method=urlfetch.GET, deadline=60)
    parsed = json.loads(reply.content)
    logging.info("### parser_main_content ###")
    return parsed
class MLStripper(HTMLParser):
    """
    HTML parser that keeps only the text found between tags.

    Text chunks are accumulated in ``self.fed``; entity and character
    references passed to the handlers are stored back in their ``&name;`` /
    ``&#num;`` source form rather than decoded.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()
        self.fed = []

    def handle_data(self, data):
        # Plain text between tags is kept verbatim.
        self.fed.append(data)

    def handle_entityref(self, name):
        # Re-emit named references (e.g. ``&amp;``) unchanged.
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        # Re-emit numeric references (e.g. ``&#39;``) unchanged.
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.fed)
def _strip_once(value):
    """
    Run ``value`` through a fresh MLStripper once and return the text it
    collected. Helper for strip_tags.
    """
    stripper = MLStripper()
    stripper.feed(value)
    # close() makes the parser process any input it is still buffering.
    stripper.close()
    return stripper.get_data()
def strip_tags(value):
    """
    Returns the given HTML with all tags stripped.

    Applies _strip_once repeatedly until a pass changes nothing or the text
    no longer contains both '<' and '>'.
    """
    while True:
        # Cheap pre-check: without both angle brackets there is nothing that
        # could be a tag, so the parser pass is skipped entirely.
        if '<' not in value or '>' not in value:
            return value
        stripped = _strip_once(value)
        if stripped == value:
            # The last pass found no further tags to remove.
            return value
        value = stripped
def get_mood(text, lang='eng'):
    """
    Returns the mood of a given text

    POSTs the UTF-8 encoded ``text``, the ``HP_KEY`` api key and ``lang`` as
    a url-encoded payload to the ``analyzesentiment`` endpoint of
    api.idolondemand.com, logs the decoded reply and returns it unchanged.
    """
    payload = urllib.urlencode({
        'text': text.encode('utf8'),
        'apikey': HP_KEY,
        'lang': lang
    })
    reply = urlfetch.fetch(
        url="https://api.idolondemand.com/1/api/sync/analyzesentiment/v1",
        method=urlfetch.POST,
        payload=payload,
        deadline=60
    )
    mood = json.loads(reply.content)
    logging.info("### get_mood ###")
    logging.info(mood)
    return mood
def get_news_mood(text):
    """
    Score the sentiment of the news related to ``text``.

    For every document returned by get_relative_news:
    1. fetch the parsed content of its ``reference`` URL,
    2. strip the tags from that content and prefix the document title,
    3. run get_mood on the result.

    Returns a list with one dict per document, holding the aggregate
    ``score`` and ``sentiment`` plus the document's ``url``, ``title`` and
    ``summary``.
    """
    results = []
    for article in get_relative_news(text):
        title = article['title']
        link = article['reference']
        raw_html = parser_main_content(link)['content']
        # NOTE(review): the second replace() maps a single space to a single
        # space, i.e. it is a no-op as written -- possibly meant to collapse
        # double spaces; confirm intent.
        flattened = raw_html.replace('\n', ' ').replace(' ', ' ')
        mood = get_mood("%s %s" % (title, strip_tags(flattened)))
        aggregate = mood['aggregate']
        results.append({
            'score': aggregate['score'],
            'sentiment': aggregate['sentiment'],
            'url': link,
            'title': title,
            'summary': article['summary']
        })
    return results
class BaseHandler(webapp2.RequestHandler):
    """Request handler base class with a Jinja2 rendering shortcut."""

    @webapp2.cached_property
    def jinja2(self):
        """Returns a Jinja2 renderer cached in the app registry."""
        return jinja2.get_jinja2(app=self.app)

    def render_response(self, _template, **context):
        """Renders a template and writes the result to the response."""
        rendered = self.jinja2.render_template(_template, **context)
        self.response.write(rendered)