File tree 5 files changed +157
-5
lines changed
5 files changed +157
-5
lines changed Original file line number Diff line number Diff line change 4
4
"cell_type": "markdown",
5
5
"metadata": {},
6
6
"source": [
7
- "# MLP classification"
7
+ "# Classification using CNN\n",
8
+ "\n",
9
+ "## Getting started"
8
10
]
9
11
},
10
12
{
49
51
"## Data preprocessing"
50
52
]
51
53
},
54
+ {
55
+ "cell_type": "markdown",
56
+ "metadata": {},
57
+ "source": [
58
+ "* Tokenize data"
59
+ ]
60
+ },
52
61
{
53
62
"cell_type": "code",
54
63
"execution_count": 31,
74
83
"sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
75
84
]
76
85
},
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "* Padding data"
91
+ ]
92
+ },
77
93
{
78
94
"cell_type": "code",
79
95
"execution_count": 32,
87
103
" padding='post', truncating='post')"
88
104
]
89
105
},
106
+ {
107
+ "cell_type": "markdown",
108
+ "metadata": {},
109
+ "source": [
110
+ "* Convert padded data to tf.data.Dataset"
111
+ ]
112
+ },
90
113
{
91
114
"cell_type": "code",
92
115
"execution_count": 33,
103
126
"test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104
127
]
105
128
},
129
+ {
130
+ "cell_type": "markdown",
131
+ "metadata": {},
132
+ "source": [
133
+ "* Import GloVe embeddings"
134
+ ]
135
+ },
106
136
{
107
137
"cell_type": "code",
108
138
"execution_count": 34,
Original file line number Diff line number Diff line change 4
4
"cell_type" : " markdown" ,
5
5
"metadata" : {},
6
6
"source" : [
7
- " # MLP classification"
7
+ " # Classification using LSTM\n " ,
8
+ " \n " ,
9
+ " ## Getting started"
8
10
]
9
11
},
10
12
{
49
51
" ## Data preprocessing"
50
52
]
51
53
},
54
+ {
55
+ "cell_type" : " markdown" ,
56
+ "metadata" : {},
57
+ "source" : [
58
+ " * Tokenize data"
59
+ ]
60
+ },
52
61
{
53
62
"cell_type" : " code" ,
54
63
"execution_count" : 46 ,
74
83
" sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
75
84
]
76
85
},
86
+ {
87
+ "cell_type" : " markdown" ,
88
+ "metadata" : {},
89
+ "source" : [
90
+ " * Padding texts"
91
+ ]
92
+ },
77
93
{
78
94
"cell_type" : " code" ,
79
95
"execution_count" : 47 ,
87
103
" padding='post', truncating='post')"
88
104
]
89
105
},
106
+ {
107
+ "cell_type" : " markdown" ,
108
+ "metadata" : {},
109
+ "source" : [
110
+ " * Convert padded texts to tf.data.Dataset"
111
+ ]
112
+ },
90
113
{
91
114
"cell_type" : " code" ,
92
115
"execution_count" : 48 ,
103
126
" test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104
127
]
105
128
},
129
+ {
130
+ "cell_type" : " markdown" ,
131
+ "metadata" : {},
132
+ "source" : [
133
+ " * Import GloVe embeddings"
134
+ ]
135
+ },
106
136
{
107
137
"cell_type" : " code" ,
108
138
"execution_count" : 49 ,
Original file line number Diff line number Diff line change 4
4
"cell_type" : " markdown" ,
5
5
"metadata" : {},
6
6
"source" : [
7
- " # MLP classification"
7
+ " # Classification using GRU"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type" : " markdown" ,
12
+ "metadata" : {},
13
+ "source" : [
14
+ " ## Getting started\n " ,
15
+ " \n " ,
16
+ " This section handles dependencies and imports the training/testing data"
8
17
]
9
18
},
10
19
{
49
58
" ## Data preprocessing"
50
59
]
51
60
},
61
+ {
62
+ "cell_type" : " markdown" ,
63
+ "metadata" : {},
64
+ "source" : [
65
+ " * Tokenize data"
66
+ ]
67
+ },
52
68
{
53
69
"cell_type" : " code" ,
54
70
"execution_count" : 24 ,
74
90
" sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
75
91
]
76
92
},
93
+ {
94
+ "cell_type" : " markdown" ,
95
+ "metadata" : {},
96
+ "source" : [
97
+ " * Padding data"
98
+ ]
99
+ },
77
100
{
78
101
"cell_type" : " code" ,
79
102
"execution_count" : 25 ,
87
110
" padding='post', truncating='post')"
88
111
]
89
112
},
113
+ {
114
+ "cell_type" : " markdown" ,
115
+ "metadata" : {},
116
+ "source" : [
117
+ " * Convert padded data to tf.data.Dataset"
118
+ ]
119
+ },
90
120
{
91
121
"cell_type" : " code" ,
92
122
"execution_count" : 26 ,
103
133
" test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
104
134
]
105
135
},
136
+ {
137
+ "cell_type" : " markdown" ,
138
+ "metadata" : {},
139
+ "source" : [
140
+ " * Import GloVe embeddings"
141
+ ]
142
+ },
106
143
{
107
144
"cell_type" : " code" ,
108
145
"execution_count" : 27 ,
Original file line number Diff line number Diff line change 4
4
"cell_type" : " markdown" ,
5
5
"metadata" : {},
6
6
"source" : [
7
- " # MLP classification"
7
+ " # Classification using LSTM and keywords"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type" : " markdown" ,
12
+ "metadata" : {},
13
+ "source" : [
14
+ " ## Getting started"
8
15
]
9
16
},
10
17
{
50
57
" ## Data preprocessing"
51
58
]
52
59
},
60
+ {
61
+ "cell_type" : " markdown" ,
62
+ "metadata" : {},
63
+ "source" : [
64
+ " * Cleaning keywords (fill NaN values and split keywords if needed)"
65
+ ]
66
+ },
53
67
{
54
68
"cell_type" : " code" ,
55
69
"execution_count" : 3 ,
63
77
" df_test['keyword'] = df_test['keyword'].apply(lambda x: re.sub('%20', ' ', x))"
64
78
]
65
79
},
80
+ {
81
+ "cell_type" : " markdown" ,
82
+ "metadata" : {},
83
+ "source" : [
84
+ " * Tokenize texts"
85
+ ]
86
+ },
66
87
{
67
88
"cell_type" : " code" ,
68
89
"execution_count" : 4 ,
92
113
" sequences_keyword_test = tokenizer.texts_to_sequences(df_test['keyword'])"
93
114
]
94
115
},
116
+ {
117
+ "cell_type" : " markdown" ,
118
+ "metadata" : {},
119
+ "source" : [
120
+ " * Padding texts"
121
+ ]
122
+ },
95
123
{
96
124
"cell_type" : " code" ,
97
125
"execution_count" : 5 ,
109
137
" padding='post')"
110
138
]
111
139
},
140
+ {
141
+ "cell_type" : " markdown" ,
142
+ "metadata" : {},
143
+ "source" : [
144
+ " * Convert data to tf.data.Dataset instances"
145
+ ]
146
+ },
112
147
{
113
148
"cell_type" : " code" ,
114
149
"execution_count" : 6 ,
125
160
" test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
126
161
]
127
162
},
163
+ {
164
+ "cell_type" : " markdown" ,
165
+ "metadata" : {},
166
+ "source" : [
167
+ " * Import GloVe embeddings"
168
+ ]
169
+ },
128
170
{
129
171
"cell_type" : " code" ,
130
172
"execution_count" : 7 ,
Original file line number Diff line number Diff line change 312
312
"outputs" : [],
313
313
"source" : [
314
314
" train_embeddings = submodel.predict(train_input)\n " ,
315
- " test_embeddings = submodel.pre"
315
+ " test_embeddings = submodel.predict(test_input)"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type" : " code" ,
320
+ "execution_count" : null ,
321
+ "metadata" : {},
322
+ "outputs" : [],
323
+ "source" : [
324
+ " with open(\" ./data/train_embeddings.pkl\" , 'wb') as f:\n " ,
325
+ " pkl.dump(train_embeddings, f)\n " ,
326
+ " \n " ,
327
+ " with open(\" ./data/test_embeddings.pkl\" , 'wb') as f:\n " ,
328
+ " pkl.dump(test_embeddings, f)"
316
329
]
317
330
}
318
331
],
You can’t perform that action at this time.
0 commit comments