-
Notifications
You must be signed in to change notification settings - Fork 290
/
demo_table.py
79 lines (70 loc) · 2.41 KB
/
demo_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
from scattertext.viz.BasicHTMLFromScatterplotStructure import D3URLs
import scattertext as st
import spacy
# Blank English pipeline with only the 'sentencizer' component added.
# (Loading 'en_core_web_sm' via spacy.load is the alternative left disabled.)
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

# Load the sample Guardian headlines and derive the columns used further down:
#   Parse    - each Text value run through `nlp`
#   MonthNum - the `.month` attribute of each Date value
#   Month    - Date formatted as "%B-%Y"
#   DateStr  - Date formatted as "%Y-%m-%d" (used as the category column)
# NOTE(review): `progress_apply` only exists on a Series after tqdm's pandas
# integration has been registered; nothing in this file does that, so it is
# presumably done by one of the imported packages - confirm.
headline_df = st.SampleCorpora.GuardianHeadlines.get_data().assign(
    Parse=lambda frame: frame['Text'].progress_apply(nlp),
    MonthNum=lambda frame: frame['Date'].apply(lambda day: day.month),
    Month=lambda frame: frame['Date'].apply(lambda day: day.strftime("%B-%Y")),
    DateStr=lambda frame: frame['Date'].apply(lambda day: day.strftime("%Y-%m-%d")),
)
# Matches strings made up entirely of ASCII letters, digits and spaces.
word_number_matcher = re.compile('^[A-Za-z0-9 ]+$')


def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
    """Return True when either boundary token of *ngram* fails the checks.

    The result is True if the first or the last token

    * has its verbatim text (``orth_``) in ``st.MY_ENGLISH_STOP_WORDS``, or
    * has stripped text that is not entirely ASCII letters, digits and
      spaces (an empty string also counts as a failure).

    Otherwise the result is False.

    NOTE(review): despite the function's name, the code flags n-grams that
    *do* begin or end with a stop word. It is passed as
    ``exclude_ngram_filter``, so True presumably means "drop this n-gram" -
    confirm against the feature extractor.
    NOTE(review): the stop-word membership test uses the token text as
    written, so it is case-sensitive - confirm that is intended.
    """
    first_text = ngram[0].orth_
    last_text = ngram[-1].orth_

    # Stop word on either edge.
    if first_text in st.MY_ENGLISH_STOP_WORDS:
        return True
    if last_text in st.MY_ENGLISH_STOP_WORDS:
        return True

    # Anything other than letters/digits/spaces on either edge.
    if word_number_matcher.match(first_text.strip()) is None:
        return True
    return word_number_matcher.match(last_text.strip()) is None
# Build the corpus one step at a time; every step rebinds `corpus` to the
# object returned by the previous call, in the same order as a single chain.

# 1. Offset corpus over the parsed headlines, categorised by DateStr, with
#    1- to 5-gram features filtered through the exclusion function above.
corpus = st.OffsetCorpusFactory(
    headline_df,
    category_col='DateStr',
    parsed_col='Parse',
    feat_and_offset_getter=st.FlexibleNGramFeatures(
        ngram_sizes=[1, 2, 3, 4, 5],
        exclude_ngram_filter=exclude_ngrams_which_do_not_start_and_end_with_function_words,
    ),
).build()

# 2. NPMI-based compaction.
corpus = corpus.compact(
    compactor=st.NPMICompactor(
        minimum_term_count=3,
        number_terms_per_length=2000,
    ),
    non_text=True,
)

# 3. N-gram percentage compaction.
corpus = corpus.compact(
    st.NgramPercentageCompactor(
        usage_portion=0.6,
    ),
    non_text=True,
)

# 4. Filter with a predicate that is true for features whose len() is 1.
corpus = corpus.filter_out(
    lambda x: len(x) == 1,
    non_text=True,
)

# 5. Association-based compaction with a budget of 2000.
corpus = corpus.compact(
    compactor=st.AssociationCompactor(
        2000,
        scorer=st.DeltaJSDivergenceScorer,
        term_ranker=st.OncePerDocFrequencyRanker,
        use_non_text_features=True,
    ),
    non_text=True,
)
# Render the table visualisation to an HTML string.
# Categories are passed in sorted order; sorted() already returns a new list.
# The D3 script URLs are relative paths, so presumably they are resolved
# relative to wherever the generated page is opened - confirm.
html = st.produce_scattertext_table(
    corpus=corpus,
    category_order=sorted(corpus.get_categories()),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.LogLikelihoodRatio),
    metadata=lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    header_clickable=True,
    d3_url_struct=D3URLs(
        d3_url='./scattertext/data/viz/scripts/d3.min.js',
        d3_scale_chromatic_url='./scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    ),
)
# Write the generated page to disk and print a hint for opening it.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default: with a non-UTF-8 default, non-ASCII characters
# in the page could raise UnicodeEncodeError or be written in a different
# encoding than a reader expecting UTF-8 would decode.
fn = 'demo_table.html'
with open(fn, 'w', encoding='utf-8') as of:
    of.write(html)
print('open ./' + fn)