|
111 | 111 |
|
112 | 112 | # In[9]: |
113 | 113 |
|
114 | | -snippet = 'This is [cough cough] and example of a [really] greedy operator' |
| 114 | +snippet = 'This is [cough cough] an example of a [really] greedy operator' |
115 | 115 | re.findall(r'\[.+\]', snippet) |
116 | 116 |
|
117 | 117 |
|
118 | 118 | # Since the operator is greedy, it matches everything in between the first opening and the last closing bracket. To make `+` consume the least possible amount of text, we'll add a `?`.
| 119 | +# |
| 120 | +# > side note - regex workflows typically use `re.compile`, as this allows you to set options called flags that can reduce the verbosity of your pattern, like `re.I` for 'ignore case'. |
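# For instance, compiling the lazy version of the pattern might look something like this (a quick sketch; the `re.I` flag isn't needed for this pattern, it's only here to show where a flag goes):

p_lazy = re.compile(r'\[.+?\]', flags=re.I)  # +? is the lazy (non-greedy) version of +
p_lazy.findall(snippet)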
119 | 121 |
|
120 | 122 | # In[10]: |
121 | 123 |
|
|
166 | 168 | match.group('name'), match.group('line') |
167 | 169 |
|
168 | 170 |
|
| 171 | +# We can also list and count all the unique characters (that is, the speakers named in the script). |
| 172 | + |
| 173 | +# In[14]: |
| 174 | + |
| 175 | +matches = re.findall(p, document) |
| 176 | +chars = set([x[0] for x in matches]) |
| 177 | + |
| 178 | + |
| 179 | +# In[15]: |
| 180 | + |
| 181 | +print(chars, len(chars)) |
| 182 | + |
| 183 | + |
169 | 184 | # #### Now let's try a small challenge! |
170 | 185 | # |
171 | 186 | # To check that you've understood something about regular expressions, we're going to have you do a small test challenge. Partner up with the person next to you - we're going to do this as a pair coding exercise - and choose which computer you are going to use. |
|
176 | 191 | # |
177 | 192 | # Let's grab Arthur's speech from above, and see what we can learn about Arthur from it. |
178 | 193 |
|
179 | | -# In[14]: |
| 194 | +# In[16]: |
180 | 195 |
|
181 | 196 | p = re.compile(r'(?:ARTHUR: )(.+)') |
182 | 197 | arthur = ' '.join(re.findall(p, document)) |
|
185 | 200 |
|
186 | 201 | # In our model for natural language, we're interested in words. The document is currently one continuous string of characters, which isn't ideal. You might be tempted to separate it into words using your newfound regex knowledge:
187 | 202 |
|
188 | | -# In[15]: |
| 203 | +# In[17]: |
189 | 204 |
|
190 | 205 | p = re.compile(r'\w+', flags=re.I) |
191 | 206 | re.findall(p, arthur)[0:10] |
192 | 207 |
|
193 | 208 |
|
194 | 209 | # But this is problematic for languages that make extensive use of punctuation and contractions. For example, see what happens with:
195 | 210 |
|
196 | | -# In[16]: |
| 211 | +# In[18]: |
197 | 212 |
|
198 | 213 | re.findall(p, "It isn't Dav's cheesecake that I'm worried about") |
199 | 214 |
|
200 | 215 |
|
201 | 216 | # The practice of pulling apart a continuous string into units is called "tokenizing", and it creates "tokens". NLTK, the canonical library for NLP in Python, has a couple of implementations for tokenizing a string into words. |
202 | 217 |
|
203 | | -# In[17]: |
| 218 | +# In[19]: |
204 | 219 |
|
| 220 | +# nltk.download('punkt') |
205 | 221 | from nltk import word_tokenize |
206 | 222 | word_tokenize("It isn't Dav's cheesecake that I'm worried about") |
207 | 223 |
|
208 | 224 |
|
209 | 225 | # The distinction here is subtle, but look at what happened to "isn't". It's been separated into "is" and "n't", which is more in keeping with the way contractions work in English.
210 | 226 |
|
211 | | -# In[18]: |
| 227 | +# In[20]: |
212 | 228 |
|
213 | 229 | tokens = word_tokenize(arthur) |
214 | 230 | tokens[0:10] |
215 | 231 |
|
216 | 232 |
|
217 | 233 | # At this point, we can start asking questions like which words are most common and which words tend to occur together.
218 | 234 |
|
219 | | -# In[19]: |
| 235 | +# In[21]: |
220 | 236 |
|
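# total number of tokens vs. the number of unique tokens (the vocabulary size)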
221 | 237 | len(tokens), len(set(tokens)) |
222 | 238 |
|
|
229 | 245 | # |
230 | 246 | # For more complicated metrics, it's easier to use NLTK's classes and methods. |
231 | 247 |
|
232 | | -# In[20]: |
| 248 | +# In[22]: |
233 | 249 |
|
234 | 250 | from nltk import collocations |
235 | 251 | fd = collocations.FreqDist(tokens) |
236 | 252 | fd.most_common()[:10] |
237 | 253 |
|
238 | 254 |
|
239 | | -# In[21]: |
| 255 | +# Let's remove punctuation and stopwords. |
| 256 | + |
| 257 | +# In[23]: |
| 258 | + |
| 259 | +from string import punctuation |
| 260 | +from nltk.corpus import stopwords |
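# nltk.download('stopwords')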
| 261 | +tokens_reduced = [x for x in tokens if x not in punctuation and x not in stopwords.words('english')] |
| 262 | +fd2 = collocations.FreqDist(tokens_reduced) |
| 263 | +fd2.most_common()[:10] |
| 264 | + |
| 265 | + |
| 266 | +# In[24]: |
240 | 267 |
|
241 | 268 | measures = collocations.BigramAssocMeasures() |
242 | 269 | c = collocations.BigramCollocationFinder.from_words(tokens) |
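# the ten bigrams with the highest pointwise mutual information (PMI), which favors pairs that rarely occur apart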
243 | 270 | c.nbest(measures.pmi, 10) |
244 | 271 |
|
245 | 272 |
|
246 | | -# In[22]: |
| 273 | +# In[25]: |
247 | 274 |
|
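# the same ten-best ranking, this time scored by the log-likelihood ratio, which is less skewed toward rare pairs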
248 | 275 | c.nbest(measures.likelihood_ratio, 10) |
249 | 276 |
|
|
260 | 287 | # |
261 | 288 | # Just like the tokenizers, we first have to create a stemmer object with the language we are using. |
262 | 289 |
|
263 | | -# In[23]: |
| 290 | +# In[26]: |
264 | 291 |
|
265 | 292 | snowball = nltk.SnowballStemmer('english') |
266 | 293 |
|
267 | 294 |
|
268 | 295 | # Now, we can try stemming some words |
269 | 296 |
|
270 | | -# In[24]: |
| 297 | +# In[27]: |
271 | 298 |
|
272 | 299 | snowball.stem('running') |
273 | 300 |
|
274 | 301 |
|
275 | | -# In[25]: |
| 302 | +# In[28]: |
276 | 303 |
|
277 | 304 | snowball.stem('eats') |
278 | 305 |
|
279 | 306 |
|
280 | | -# In[26]: |
| 307 | +# In[29]: |
281 | 308 |
|
282 | 309 | snowball.stem('embarrassed') |
283 | 310 |
|
284 | 311 |
|
285 | 312 | # Snowball is a very fast algorithm, but it has a lot of edge cases. In some cases, words that share a root are reduced to two different stems. |
286 | 313 |
|
287 | | -# In[27]: |
| 314 | +# In[30]: |
288 | 315 |
|
289 | 316 | snowball.stem('cylinder'), snowball.stem('cylindrical') |
290 | 317 |
|
|
293 | 320 | # |
294 | 321 | # > This is sometimes referred to as a 'collision' |
295 | 322 |
|
296 | | -# In[28]: |
| 323 | +# In[31]: |
297 | 324 |
|
298 | 325 | snowball.stem('vacation'), snowball.stem('vacate') |
299 | 326 |
|
300 | 327 |
|
301 | | -# In[29]: |
| 328 | +# In[32]: |
302 | 329 |
|
303 | 330 | snowball.stem('organization'), snowball.stem('organ') |
304 | 331 |
|
305 | 332 |
|
306 | | -# In[30]: |
| 333 | +# In[33]: |
307 | 334 |
|
308 | 335 | snowball.stem('iron'), snowball.stem('ironic') |
309 | 336 |
|
310 | 337 |
|
311 | | -# In[31]: |
| 338 | +# In[34]: |
312 | 339 |
|
313 | 340 | snowball.stem('vertical'), snowball.stem('vertices') |
314 | 341 |
|
315 | 342 |
|
316 | 343 | # A more accurate approach is to use an English word bank like WordNet to look up the dictionary form of each word, a process called lemmatization. |
317 | 344 |
|
318 | | -# In[32]: |
| 345 | +# In[35]: |
319 | 346 |
|
320 | 347 | # nltk.download('wordnet') |
321 | 348 | wordnet = nltk.WordNetLemmatizer() |
322 | 349 |
|
323 | 350 |
|
324 | | -# In[33]: |
| 351 | +# In[36]: |
325 | 352 |
|
326 | 353 | wordnet.lemmatize('iron'), wordnet.lemmatize('ironic') |
327 | 354 |
|
328 | 355 |
|
329 | | -# In[34]: |
| 356 | +# In[37]: |
330 | 357 |
|
331 | 358 | wordnet.lemmatize('vacation'), wordnet.lemmatize('vacate') |
332 | 359 |
|
333 | 360 |
|
334 | 361 | # Nothing comes for free, and you've probably noticed already that the lemmatizer is slower. We can see how much slower with one of IPython's `magic functions`. |
335 | 362 |
|
336 | | -# In[35]: |
| 363 | +# In[38]: |
337 | 364 |
|
338 | 365 | get_ipython().magic("timeit wordnet.lemmatize('table')") |
339 | 366 |
|
340 | 367 |
|
341 | | -# In[36]: |
| 368 | +# In[39]: |
342 | 369 |
|
343 | 370 | 4.45 * 5.12 |
344 | 371 |
|
345 | 372 |
|
346 | | -# In[37]: |
| 373 | +# In[40]: |
347 | 374 |
|
348 | 375 | get_ipython().magic("timeit snowball.stem('table')") |
349 | 376 |
|
350 | 377 |
|
| 378 | +# Other cool things you can do with WordNet include looking up hypernyms (more general terms) and hyponyms (more specific terms). |
| 379 | + |
| 380 | +# In[41]: |
| 381 | + |
| 382 | +from nltk.corpus import wordnet as wn |
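# 'dog.n.01' is the first noun sense of "dog" in WordNet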
| 383 | +dog = wn.synset('dog.n.01') |
| 384 | +dog.hypernyms() |
| 385 | + |
| 386 | + |
| 387 | +# In[42]: |
| 388 | + |
| 389 | +dog.hyponyms() |
| 390 | + |
| 391 | + |
351 | 392 | # #### Time for another small challenge! |
352 | 393 | # |
353 | 394 | # Switch computers for this one, so that you are using your partner's computer, and try your hand at challenge B! |
|
362 | 403 | # |
363 | 404 | # We're going to use TextBlob's built-in sentiment classifier, because it is super easy. |
364 | 405 |
|
365 | | -# In[38]: |
| 406 | +# In[43]: |
366 | 407 |
|
367 | 408 | from textblob import TextBlob |
368 | 409 |
|
369 | 410 |
|
370 | | -# In[39]: |
| 411 | +# In[44]: |
371 | 412 |
|
372 | 413 | blob = TextBlob(arthur) |
373 | 414 |
|
374 | 415 |
|
375 | | -# In[40]: |
| 416 | +# In[45]: |
376 | 417 |
|
377 | 418 | for sentence in blob.sentences[10:25]: |
378 | 419 | print(sentence.sentiment.polarity, sentence) |
|
386 | 427 | # |
387 | 428 | # Luckily for us, there is another Python library that takes care of the heavy lifting. |
388 | 429 |
|
389 | | -# In[41]: |
| 430 | +# In[46]: |
390 | 431 |
|
391 | 432 | from gensim import corpora, models, similarities |
392 | 433 |
|
393 | 434 |
|
394 | 435 | # We already have a document for Arthur, but let's grab the text from someone else to compare it with. |
395 | 436 |
|
396 | | -# In[42]: |
| 437 | +# In[47]: |
397 | 438 |
|
398 | 439 | p = re.compile(r'(?:GALAHAD: )(.+)') |
399 | 440 | galahad = ' '.join(re.findall(p, document)) |
|
403 | 444 |
|
404 | 445 | # Now, we use gensim to create vectors from these tokenized documents: |
405 | 446 |
|
406 | | -# In[43]: |
| 447 | +# In[48]: |
407 | 448 |
|
408 | 449 | dictionary = corpora.Dictionary([arthur_tokens, galahad_tokens]) |
409 | 450 | corpus = [dictionary.doc2bow(doc) for doc in [arthur_tokens, galahad_tokens]] |
|
412 | 453 |
|
413 | 454 | # Then, we create matrix representations of our query and our corpus. |
414 | 455 |
|
415 | | -# In[44]: |
| 456 | +# In[49]: |
416 | 457 |
|
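# doc2bow turns the one-word query into a bag-of-words vector, which the tf-idf model re-weights;
# MatrixSimilarity then builds a cosine-similarity index over the tf-idf vectors of our two documents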
417 | 458 | query = tfidf[dictionary.doc2bow(['peasant'])] |
418 | 459 | index = similarities.MatrixSimilarity(tfidf[corpus]) |
419 | 460 |
|
420 | 461 |
|
421 | 462 | # And finally, we can test our query, "peasant", against the two documents in our corpus. |
422 | 463 |
|
423 | | -# In[45]: |
| 464 | +# In[50]: |
424 | 465 |
|
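# each pair is (document index, similarity score): document 0 is Arthur, document 1 is Galahad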
425 | 466 | list(enumerate(index[query])) |
426 | 467 |
|
|
435 | 476 | # |
436 | 477 | # 1. Is King Arthur happier than Sir Robin, based on his speech? |
437 | 478 | # 2. Which character in Monty Python has the biggest vocabulary? |
438 | | - |
439 | | -# In[46]: |
440 | | - |
441 | | - |
442 | | - |
|