add descriptions

vesran · vesran · commit 02ec0746d056 · 2020-08-01T12:15:55.000+02:00
diff --git a/03_CNN.ipynb b/03_CNN.ipynb
@@ -4,7 +4,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# MLP classification"
+    "# Classification using CNN\n",
+    "\n",
+    "## Getting started"
    ]
   },
   {
@@ -49,6 +51,13 @@
     "## Data preprocessing"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Tokenize data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 31,
@@ -74,6 +83,13 @@
     "sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Pad data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 32,
@@ -87,6 +103,13 @@
     "                                                            padding='post', truncating='post')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Convert padded data to tf.data.Dataset"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 33,
@@ -103,6 +126,13 @@
     "test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Import GloVe embeddings"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 34,
diff --git a/04_LSTM.ipynb b/04_LSTM.ipynb
@@ -4,7 +4,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# MLP classification"
+    "# Classification using LSTM\n",
+    "\n",
+    "## Getting started"
    ]
   },
   {
@@ -49,6 +51,13 @@
     "## Data preprocessing"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Tokenize data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 46,
@@ -74,6 +83,13 @@
     "sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Pad texts"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 47,
@@ -87,6 +103,13 @@
     "                                                            padding='post', truncating='post')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Convert padded texts to tf.data.Dataset"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 48,
@@ -103,6 +126,13 @@
     "test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Import GloVe embeddings"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 49,
diff --git a/05_GRU.ipynb b/05_GRU.ipynb
@@ -4,7 +4,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# MLP classification"
+    "# Classification using GRU"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting started\n",
+    "\n",
+    "This section handles the dependencies and imports the training/testing data."
    ]
   },
   {
@@ -49,6 +58,13 @@
     "## Data preprocessing"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Tokenize data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 24,
@@ -74,6 +90,13 @@
     "sequences_test = tokenizer.texts_to_sequences(df_test['clean_text'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Pad data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 25,
@@ -87,6 +110,13 @@
     "                                                            padding='post', truncating='post')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Convert padded data to tf.data.Dataset"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 26,
@@ -103,6 +133,13 @@
     "test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Import GloVe embeddings"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 27,
diff --git a/06_LSTM+keyword.ipynb b/06_LSTM+keyword.ipynb
@@ -4,7 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# MLP classification"
+    "# Classification using LSTM and keywords"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting started"
    ]
   },
   {
@@ -50,6 +57,13 @@
     "## Data preprocessing"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Clean keywords (fill NaN values and split keywords if needed)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -63,6 +77,13 @@
     "df_test['keyword'] = df_test['keyword'].apply(lambda x: re.sub('%20', ' ', x))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Tokenize texts"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -92,6 +113,13 @@
     "sequences_keyword_test = tokenizer.texts_to_sequences(df_test['keyword'])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Pad texts"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -109,6 +137,13 @@
     "                                                                          padding='post')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Convert data to tf.data.Dataset instances"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
@@ -125,6 +160,13 @@
     "test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(100)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Import GloVe embeddings"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
diff --git a/07_BERT_baseline.ipynb b/07_BERT_baseline.ipynb
@@ -312,7 +312,20 @@
    "outputs": [],
    "source": [
     "train_embeddings = submodel.predict(train_input)\n",
-    "test_embeddings = submodel.pre"
+    "test_embeddings = submodel.predict(test_input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"./data/train_embeddings.pkl\", 'wb') as f:\n",
+    "    pkl.dump(train_embeddings, f)\n",
+    "    \n",
+    "with open(\"./data/test_embeddings.pkl\", 'wb') as f:\n",
+    "    pkl.dump(test_embeddings, f)"
    ]
   }
  ],