This repository was archived by the owner on May 6, 2022. It is now read-only.

Commit e38b306

regenerating documents
1 parent 5cf7b3a commit e38b306

6 files changed: 74 additions, 38 deletions

instructor/day_four.pdf

3.1 KB
Binary file not shown.

instructor/day_four.py

Lines changed: 74 additions & 38 deletions
@@ -111,11 +111,13 @@
 
 # In[9]:
 
-snippet = 'This is [cough cough] and example of a [really] greedy operator'
+snippet = 'This is [cough cough] an example of a [really] greedy operator'
 re.findall(r'\[.+\]', snippet)
 
 
 # Since the operator is greedy, it is matching everything in between the first open and the last close bracket. To make `+` consume the least possible amount of string, we'll add a `?`.
+#
+# > side note - regex workflows typically use `re.compile`, as this allows you to set options called flags that can reduce the verbosity of your pattern, like `re.I` for 'ignore case'.
 
 # In[10]:
 
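For reference, a minimal sketch of the greedy/lazy difference and the compiled-pattern style the side note mentions (the `bracketed` name is illustrative, not from the notebook):

import re

snippet = 'This is [cough cough] an example of a [really] greedy operator'

# Greedy: .+ runs from the first '[' to the last ']'
re.findall(r'\[.+\]', snippet)     # ['[cough cough] an example of a [really]']

# Lazy: .+? stops at the first ']' it can reach
re.findall(r'\[.+?\]', snippet)    # ['[cough cough]', '[really]']

# Compiled pattern with a flag, as in the side note
bracketed = re.compile(r'\[.+?\]', flags=re.I)
bracketed.findall(snippet)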
@@ -166,6 +168,19 @@
 match.group('name'), match.group('line')
 
 
+# We can also list and count all the unique characters.
+
+# In[14]:
+
+matches = re.findall(p, document)
+chars = set([x[0] for x in matches])
+
+
+# In[15]:
+
+print(chars, len(chars))
+
+
 # #### Now let's try a small challenge!
 #
 # To check that you've understood something about regular expressions, we're going to have you do a small test challenge. Partner up with the person next to you - we're going to do this as a pair coding exercise - and choose which computer you are going to use.
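As a hedged extension of the new counting cells: the notebook's pattern `p` is defined earlier and not shown in this hunk, so the pattern below is an assumed stand-in with the same two-group shape; `collections.Counter` turns the unique-speaker listing into per-speaker line counts.

import re
from collections import Counter

# Assumed stand-in for the notebook's pattern: first group is the speaker, second the line
p = re.compile(r'(?P<name>[A-Z ]+)(?:: )(?P<line>.+)')

matches = re.findall(p, document)             # list of (name, line) tuples
chars = set(name for name, _ in matches)      # unique speakers, as in the cell above
counts = Counter(name for name, _ in matches)
print(len(chars), counts.most_common(5))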
@@ -176,7 +191,7 @@
 #
 # Let's grab Arthur's speech from above, and see what we can learn about Arthur from it.
 
-# In[14]:
+# In[16]:
 
 p = re.compile(r'(?:ARTHUR: )(.+)')
 arthur = ' '.join(re.findall(p, document))
@@ -185,38 +200,39 @@
 
 # In our model for natural language, we're interested in words. The document is currently a continuous string of bytes, which isn't ideal. You might be tempted to separate this into words using your newfound regex knowledge:
 
-# In[15]:
+# In[17]:
 
 p = re.compile(r'\w+', flags=re.I)
 re.findall(p, arthur)[0:10]
 
 
 # But this is problematic for languages that make extensive use of punctuation. For example, see what happens with:
 
-# In[16]:
+# In[18]:
 
 re.findall(p, "It isn't Dav's cheesecake that I'm worried about")
 
 
 # The practice of pulling apart a continuous string into units is called "tokenizing", and it creates "tokens". NLTK, the canonical library for NLP in Python, has a couple of implementations for tokenizing a string into words.
 
-# In[17]:
+# In[19]:
 
+#nltk.download('punkt')
 from nltk import word_tokenize
 word_tokenize("It isn't Dav's cheesecake that I'm worried about")
 
 
 # The distinction here is subtle, but look at what happened to "isn't". It's been separated into "IS" and "N'T", which is more in keeping with the way contractions work in English.
 
-# In[18]:
+# In[20]:
 
 tokens = word_tokenize(arthur)
 tokens[0:10]
 
 
 # At this point, we can start asking questions like what are the most common words, and what words tend to occur together.
 
-# In[19]:
+# In[21]:
 
 len(tokens), len(set(tokens))
 
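On the "couple of implementations" point above, a hedged comparison: NLTK also ships `wordpunct_tokenize`, which splits on punctuation more aggressively than `word_tokenize` (outputs in the comments are abbreviated).

from nltk import word_tokenize
from nltk.tokenize import wordpunct_tokenize

s = "It isn't Dav's cheesecake that I'm worried about"
word_tokenize(s)       # ['It', 'is', "n't", 'Dav', "'s", 'cheesecake', ...]
wordpunct_tokenize(s)  # ['It', 'isn', "'", 't', 'Dav', "'", 's', ...]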
@@ -229,21 +245,32 @@
 #
 # For more complicated metrics, it's easier to use NLTK's classes and methods.
 
-# In[20]:
+# In[22]:
 
 from nltk import collocations
 fd = collocations.FreqDist(tokens)
 fd.most_common()[:10]
 
 
-# In[21]:
+# Let's remove punctuation and stopwords.
+
+# In[23]:
+
+from string import punctuation
+from nltk.corpus import stopwords
+tokens_reduced = [x for x in tokens if x not in punctuation and x not in stopwords.words('english')]
+fd2 = collocations.FreqDist(tokens_reduced)
+fd2.most_common()[:10]
+
+
+# In[24]:
 
 measures = collocations.BigramAssocMeasures()
 c = collocations.BigramCollocationFinder.from_words(tokens)
 c.nbest(measures.pmi, 10)
 
 
-# In[22]:
+# In[25]:
 
 c.nbest(measures.likelihood_ratio, 10)
 
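One hedged addition to the collocation cells above: PMI tends to over-reward bigrams that occur only once or twice, so a frequency filter is usually applied before ranking; `apply_freq_filter` is part of NLTK's collocation finder.

from nltk import collocations

measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(3)        # drop bigrams seen fewer than 3 times
finder.nbest(measures.pmi, 10)     # PMI ranking, now less dominated by one-off pairs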
@@ -260,31 +287,31 @@
 #
 # Just like the tokenizers, we first have to create a stemmer object with the language we are using.
 
-# In[23]:
+# In[26]:
 
 snowball = nltk.SnowballStemmer('english')
 
 
 # Now, we can try stemming some words
 
-# In[24]:
+# In[27]:
 
 snowball.stem('running')
 
 
-# In[25]:
+# In[28]:
 
 snowball.stem('eats')
 
 
-# In[26]:
+# In[29]:
 
 snowball.stem('embarassed')
 
 
 # Snowball is a very fast algorithm, but it has a lot of edge cases. In some cases, words with the same stem are reduced to two different stems.
 
-# In[27]:
+# In[30]:
 
 snowball.stem('cylinder'), snowball.stem('cylindrical')
 
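For a side-by-side feel for these edge cases, a small hedged sketch comparing Snowball with NLTK's older Porter stemmer on the same words:

import nltk

snowball = nltk.SnowballStemmer('english')
porter = nltk.PorterStemmer()

for word in ['running', 'eats', 'embarassed', 'cylinder', 'cylindrical']:
    print(word, snowball.stem(word), porter.stem(word))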
@@ -293,61 +320,75 @@
 #
 # > This is sometimes referred to as a 'collision'
 
-# In[28]:
+# In[31]:
 
 snowball.stem('vacation'), snowball.stem('vacate')
 
 
-# In[29]:
+# In[32]:
 
 snowball.stem('organization'), snowball.stem('organ')
 
 
-# In[30]:
+# In[33]:
 
 snowball.stem('iron'), snowball.stem('ironic')
 
 
-# In[31]:
+# In[34]:
 
 snowball.stem('vertical'), snowball.stem('vertices')
 
 
 # A more accurate approach is to use an English word bank like WordNet to call dictionary lookups on word forms, in a process called lemmatization.
 
-# In[32]:
+# In[35]:
 
 # nltk.download('wordnet')
 wordnet = nltk.WordNetLemmatizer()
 
 
-# In[33]:
+# In[36]:
 
 wordnet.lemmatize('iron'), wordnet.lemmatize('ironic')
 
 
-# In[34]:
+# In[37]:
 
 wordnet.lemmatize('vacation'), wordnet.lemmatize('vacate')
 
 
 # Nothing comes for free, and you've probably noticed already that the lemmatizer is slower. We can see how much slower with one of IPython's `magic functions`.
 
-# In[35]:
+# In[38]:
 
 get_ipython().magic("timeit wordnet.lemmatize('table')")
 
 
-# In[36]:
+# In[39]:
 
 4.45 * 5.12
 
 
-# In[37]:
+# In[40]:
 
 get_ipython().magic("timeit snowball.stem('table')")
 
 
+# Other cool things you can do with WordNet include hypernyms and hyponyms.
+
+# In[41]:
+
+from nltk.corpus import wordnet as wn
+dog = wn.synset('dog.n.01')
+dog.hypernyms()
+
+
+# In[42]:
+
+dog.hyponyms()
+
+
 # #### Time for another small challenge!
 #
 # Switch computers for this one, so that you are using your partner's computer, and try your hand at challenge B!
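A hedged note on the lemmatizer and WordNet cells in the hunk above: `WordNetLemmatizer.lemmatize` treats words as nouns unless you pass a part-of-speech tag, and the synsets returned by `hypernyms()` can be printed by name.

import nltk
from nltk.corpus import wordnet as wn

wordnet = nltk.WordNetLemmatizer()
wordnet.lemmatize('running')             # 'running' (noun reading by default)
wordnet.lemmatize('running', pos='v')    # 'run'

dog = wn.synset('dog.n.01')
[s.name() for s in dog.hypernyms()]      # e.g. ['canine.n.02', 'domestic_animal.n.01']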
@@ -362,17 +403,17 @@
 #
 # We're going to use TextBlob's built-in sentiment classifier, because it is super easy.
 
-# In[38]:
+# In[43]:
 
 from textblob import TextBlob
 
 
-# In[39]:
+# In[44]:
 
 blob = TextBlob(arthur)
 
 
-# In[40]:
+# In[45]:
 
 for sentence in blob.sentences[10:25]:
     print(sentence.sentiment.polarity, sentence)
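For reference, a minimal hedged sketch of what TextBlob's sentiment property returns: a `(polarity, subjectivity)` pair, with polarity running from -1 (negative) to 1 (positive).

from textblob import TextBlob

TextBlob("What a brave and wonderful knight").sentiment              # polarity > 0 here
TextBlob("What a terrible, miserable day").sentiment.polarity        # a negative number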
@@ -386,14 +427,14 @@
 #
 # Luckily for us, there is another Python library that takes care of the heavy lifting.
 
-# In[41]:
+# In[46]:
 
 from gensim import corpora, models, similarities
 
 
 # We already have a document for Arthur, but let's grab the text from someone else to compare it with.
 
-# In[42]:
+# In[47]:
 
 p = re.compile(r'(?:GALAHAD: )(.+)')
 galahad = ' '.join(re.findall(p, document))
@@ -403,7 +444,7 @@
 
 # Now, we use gensim to create vectors from these tokenized documents:
 
-# In[43]:
+# In[48]:
 
 dictionary = corpora.Dictionary([arthur_tokens, galahad_tokens])
 corpus = [dictionary.doc2bow(doc) for doc in [arthur_tokens, galahad_tokens]]
@@ -412,15 +453,15 @@
 
 # Then, we create matrix models of our corpus and query.
 
-# In[44]:
+# In[49]:
 
 query = tfidf[dictionary.doc2bow(['peasant'])]
 index = similarities.MatrixSimilarity(tfidf[corpus])
 
 
 # And finally, we can test our query, "peasant", on the two documents in our corpus
 
-# In[45]:
+# In[50]:
 
 list(enumerate(index[query]))
 
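To see the whole gensim pipeline in one place, a hedged, self-contained sketch on two toy documents (the notebook uses `arthur_tokens` and `galahad_tokens` instead):

from gensim import corpora, models, similarities

docs = [['bring', 'out', 'your', 'dead'],
        ['i', 'am', 'not', 'dead', 'yet']]

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = models.TfidfModel(corpus)

index = similarities.MatrixSimilarity(tfidf[corpus])
query = tfidf[dictionary.doc2bow(['yet'])]   # 'yet' occurs only in the second document
list(enumerate(index[query]))                # cosine similarity of the query against each document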

@@ -435,8 +476,3 @@
435476
#
436477
# 1. Is King Arthur happier than Sir Robin, based on his speech?
437478
# 2. Which character in Monty Python has the biggest vocabulary?
438-
439-
# In[46]:
440-
441-
442-

instructor/day_one.pdf

224 Bytes
Binary file not shown.

instructor/day_three.pdf

41.9 KB
Binary file not shown.

instructor/day_two.pdf

33 Bytes
Binary file not shown.

instructor/day_zero.pdf

65 Bytes
Binary file not shown.
