Skip to content

Commit 02ec074

Browse files
committed
add descriptions
1 parent 14c7db3 commit 02ec074

File tree

5 files changed

+157
-5
lines changed

5 files changed

+157
-5
lines changed

03_CNN.ipynb

+31-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# MLP classification"
7+
"# Classification using CNN\n",
8+
"\n",
9+
"## Getting started"
810
]
911
},
1012
{
@@ -49,6 +51,13 @@
4951
"## Data preprocessing"
5052
]
5153
},
54+
{
55+
"cell_type": "markdown",
56+
"metadata": {},
57+
"source": [
58+
"* Tokenize data"
59+
]
60+
},
5261
{
5362
"cell_type": "code",
5463
"execution_count": 31,
@@ -74,6 +83,13 @@
7483
"sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
7584
]
7685
},
86+
{
87+
"cell_type": "markdown",
88+
"metadata": {},
89+
"source": [
90+
"* Padding data"
91+
]
92+
},
7793
{
7894
"cell_type": "code",
7995
"execution_count": 32,
@@ -87,6 +103,13 @@
87103
" padding='post', truncating='post')"
88104
]
89105
},
106+
{
107+
"cell_type": "markdown",
108+
"metadata": {},
109+
"source": [
110+
"* Convert padded data to tf.data.Dataset"
111+
]
112+
},
90113
{
91114
"cell_type": "code",
92115
"execution_count": 33,
@@ -103,6 +126,13 @@
103126
"test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104127
]
105128
},
129+
{
130+
"cell_type": "markdown",
131+
"metadata": {},
132+
"source": [
133+
"* Import GloVe embeddings"
134+
]
135+
},
106136
{
107137
"cell_type": "code",
108138
"execution_count": 34,

04_LSTM.ipynb

+31-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# MLP classification"
7+
"# Classification using LSTM\n",
8+
"\n",
9+
"## Getting started"
810
]
911
},
1012
{
@@ -49,6 +51,13 @@
4951
"## Data preprocessing"
5052
]
5153
},
54+
{
55+
"cell_type": "markdown",
56+
"metadata": {},
57+
"source": [
58+
"* Tokenize data"
59+
]
60+
},
5261
{
5362
"cell_type": "code",
5463
"execution_count": 46,
@@ -74,6 +83,13 @@
7483
"sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
7584
]
7685
},
86+
{
87+
"cell_type": "markdown",
88+
"metadata": {},
89+
"source": [
90+
"* Padding texts"
91+
]
92+
},
7793
{
7894
"cell_type": "code",
7995
"execution_count": 47,
@@ -87,6 +103,13 @@
87103
" padding='post', truncating='post')"
88104
]
89105
},
106+
{
107+
"cell_type": "markdown",
108+
"metadata": {},
109+
"source": [
110+
"* Convert padded texts to tf.data.Dataset"
111+
]
112+
},
90113
{
91114
"cell_type": "code",
92115
"execution_count": 48,
@@ -103,6 +126,13 @@
103126
"test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104127
]
105128
},
129+
{
130+
"cell_type": "markdown",
131+
"metadata": {},
132+
"source": [
133+
"* Import GloVe embeddings"
134+
]
135+
},
106136
{
107137
"cell_type": "code",
108138
"execution_count": 49,

05_GRU.ipynb

+38-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,16 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# MLP classification"
7+
"# Classification using GRU"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"## Getting started\n",
15+
"\n",
16+
"Here we handle dependencies and import the training/testing data"
817
]
918
},
1019
{
@@ -49,6 +58,13 @@
4958
"## Data preprocessing"
5059
]
5160
},
61+
{
62+
"cell_type": "markdown",
63+
"metadata": {},
64+
"source": [
65+
"* Tokenize data"
66+
]
67+
},
5268
{
5369
"cell_type": "code",
5470
"execution_count": 24,
@@ -74,6 +90,13 @@
7490
"sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
7591
]
7692
},
93+
{
94+
"cell_type": "markdown",
95+
"metadata": {},
96+
"source": [
97+
"* Padding data"
98+
]
99+
},
77100
{
78101
"cell_type": "code",
79102
"execution_count": 25,
@@ -87,6 +110,13 @@
87110
" padding='post', truncating='post')"
88111
]
89112
},
113+
{
114+
"cell_type": "markdown",
115+
"metadata": {},
116+
"source": [
117+
"* Convert padded data to tf.data.Dataset"
118+
]
119+
},
90120
{
91121
"cell_type": "code",
92122
"execution_count": 26,
@@ -103,6 +133,13 @@
103133
"test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104134
]
105135
},
136+
{
137+
"cell_type": "markdown",
138+
"metadata": {},
139+
"source": [
140+
"* Import GloVe embeddings"
141+
]
142+
},
106143
{
107144
"cell_type": "code",
108145
"execution_count": 27,

06_LSTM+keyword.ipynb

+43-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,14 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# MLP classification"
7+
"# Classification using LSTM and keywords"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"## Getting started"
815
]
916
},
1017
{
@@ -50,6 +57,13 @@
5057
"## Data preprocessing"
5158
]
5259
},
60+
{
61+
"cell_type": "markdown",
62+
"metadata": {},
63+
"source": [
64+
"* Cleaning keywords (fill NaN values and split keywords if needed)"
65+
]
66+
},
5367
{
5468
"cell_type": "code",
5569
"execution_count": 3,
@@ -63,6 +77,13 @@
6377
"df_test['keyword'] = df_test['keyword'].apply(lambda x: re.sub('%20', ' ', x))"
6478
]
6579
},
80+
{
81+
"cell_type": "markdown",
82+
"metadata": {},
83+
"source": [
84+
"* Tokenize texts"
85+
]
86+
},
6687
{
6788
"cell_type": "code",
6889
"execution_count": 4,
@@ -92,6 +113,13 @@
92113
"sequences_keyword_test = tokenizer.texts_to_sequences(df_test['keyword'])"
93114
]
94115
},
116+
{
117+
"cell_type": "markdown",
118+
"metadata": {},
119+
"source": [
120+
"* Padding texts"
121+
]
122+
},
95123
{
96124
"cell_type": "code",
97125
"execution_count": 5,
@@ -109,6 +137,13 @@
109137
" padding='post')"
110138
]
111139
},
140+
{
141+
"cell_type": "markdown",
142+
"metadata": {},
143+
"source": [
144+
"* Convert data to tf.data.Dataset instances"
145+
]
146+
},
112147
{
113148
"cell_type": "code",
114149
"execution_count": 6,
@@ -125,6 +160,13 @@
125160
"test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
126161
]
127162
},
163+
{
164+
"cell_type": "markdown",
165+
"metadata": {},
166+
"source": [
167+
"* Import GloVe embeddings"
168+
]
169+
},
128170
{
129171
"cell_type": "code",
130172
"execution_count": 7,

07_BERT_baseline.ipynb

+14-1
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,20 @@
312312
"outputs": [],
313313
"source": [
314314
"train_embeddings = submodel.predict(train_input)\n",
315-
"test_embeddings = submodel.pre"
315+
"test_embeddings = submodel.predict(test_input)"
316+
]
317+
},
318+
{
319+
"cell_type": "code",
320+
"execution_count": null,
321+
"metadata": {},
322+
"outputs": [],
323+
"source": [
324+
"with open(\"./data/train_embeddings.pkl\", 'wb') as f:\n",
325+
" pkl.dump(train_embeddings, f)\n",
326+
" \n",
327+
"with open(\"./data/test_embeddings.pkl\", 'wb') as f:\n",
328+
" pkl.dump(test_embeddings, f)"
316329
]
317330
}
318331
],

0 commit comments

Comments
 (0)