2 changes: 2 additions & 0 deletions .gitignore
@@ -22,3 +22,5 @@
npm-debug.log*
yarn-debug.log*
yarn-error.log*

.idea
318 changes: 318 additions & 0 deletions Analysis/.ipynb_checkpoints/Stinger_LSTM-checkpoint.ipynb
@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8a77807f92f26ee",
"metadata": {},
"source": [
"1. Setup + Imports"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f0f09687eb0be6fa",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-01T22:47:31.485735Z",
"start_time": "2025-11-01T22:47:31.481782Z"
}
},
"outputs": [],
"source": [
"import os\n",
"import tensorflow as tf\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "8a4683df52874161",
"metadata": {},
"source": [
"2. Data Preparation\n",
" - loading dataset\n",
" - preprocessing\n",
" - creating sequences\n",
" - splitting data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ac5ec41278b7be4e",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-01T22:58:20.940274Z",
"start_time": "2025-11-01T22:58:20.891444Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2878, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Distance Between Stations (km)</th>\n",
" <th>Weather Conditions</th>\n",
" <th>Day of the Week</th>\n",
" <th>Time of Day</th>\n",
" <th>Train Type</th>\n",
" <th>Historical Delay (min)</th>\n",
" <th>Route Congestion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100</td>\n",
" <td>Clear</td>\n",
" <td>Monday</td>\n",
" <td>Morning</td>\n",
" <td>Express</td>\n",
" <td>5</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>150</td>\n",
" <td>Rainy</td>\n",
" <td>Tuesday</td>\n",
" <td>Afternoon</td>\n",
" <td>Superfast</td>\n",
" <td>10</td>\n",
" <td>Medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>200</td>\n",
" <td>Foggy</td>\n",
" <td>Wednesday</td>\n",
" <td>Evening</td>\n",
" <td>Local</td>\n",
" <td>15</td>\n",
" <td>High</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>50</td>\n",
" <td>Clear</td>\n",
" <td>Thursday</td>\n",
" <td>Night</td>\n",
" <td>Express</td>\n",
" <td>2</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>75</td>\n",
" <td>Rainy</td>\n",
" <td>Friday</td>\n",
" <td>Morning</td>\n",
" <td>Superfast</td>\n",
" <td>8</td>\n",
" <td>Medium</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Distance Between Stations (km) Weather Conditions Day of the Week \\\n",
"0 100 Clear Monday \n",
"1 150 Rainy Tuesday \n",
"2 200 Foggy Wednesday \n",
"3 50 Clear Thursday \n",
"4 75 Rainy Friday \n",
"\n",
" Time of Day Train Type Historical Delay (min) Route Congestion \n",
"0 Morning Express 5 Low \n",
"1 Afternoon Superfast 10 Medium \n",
"2 Evening Local 15 High \n",
"3 Night Express 2 Low \n",
"4 Morning Superfast 8 Medium "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"data/train_delay_data.csv\")\n",
"print(df.shape)\n",
"df.head()\n",
"\n"
]
},
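{
"cell_type": "markdown",
"id": "3f1a2b4c5d6e7f80",
"metadata": {},
"source": [
"A minimal preprocessing sketch, assuming 'Historical Delay (min)' is the target to predict: one-hot encode the categorical columns, min-max scale the feature columns, and move the target to the last position so the sequence helper below can find it. The column choices here are assumptions read off the preview above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a2b3c4d5e6f7a81",
"metadata": {},
"outputs": [],
"source": [
"# Assumed target column and categorical feature columns (based on df.head() above).\n",
"target_col = \"Historical Delay (min)\"\n",
"categorical_cols = [\"Weather Conditions\", \"Day of the Week\", \"Time of Day\",\n",
"                    \"Train Type\", \"Route Congestion\"]\n",
"\n",
"# One-hot encode the categorical columns as floats.\n",
"df_enc = pd.get_dummies(df, columns=categorical_cols, dtype=float)\n",
"\n",
"# Min-max scale the feature columns (the target stays in minutes).\n",
"feature_cols = [c for c in df_enc.columns if c != target_col]\n",
"df_enc[feature_cols] = (df_enc[feature_cols] - df_enc[feature_cols].min()) / (\n",
"    df_enc[feature_cols].max() - df_enc[feature_cols].min() + 1e-9\n",
")\n",
"\n",
"# Reorder so the target is the last column, as make_lstm_sequences() assumes.\n",
"df_model = df_enc[feature_cols + [target_col]]\n",
"df_model.head()"
]
},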
{
"cell_type": "code",
"execution_count": null,
"id": "ac335af4",
"metadata": {},
"outputs": [],
"source": [
" \"\"\"\n",
" Turns your DataFrame into LSTM-ready sequences.\n",
"\n",
" Inputs:\n",
" - df: your pandas DataFrame.\n",
" ⚠️ The last column should be the target you want to predict.\n",
" All other columns are treated as input features.\n",
" Make sure df is sorted by time (oldest → newest).\n",
" - steps: how many past rows (time steps) to include in each sequence.\n",
" e.g., steps = 24 uses the last 24 hours/days/etc. to predict the next.\n",
"\n",
" Outputs:\n",
" - X: NumPy array of shape (num_samples, steps, num_features)\n",
" → the rolling windows of input features.\n",
" - y: NumPy array of shape (num_samples,)\n",
" → the corresponding target values.\n",
" \"\"\"\n",
"\n",
"\n",
"def make_lstm_sequences(df, steps):\n",
" X, y = [], []\n",
" values = df.values # convert DataFrame to NumPy array\n",
"\n",
" for i in range(len(values) - steps):\n",
" # Take the past `steps` rows of all columns except the last (features)\n",
" X.append(values[i:i+steps, :-1])\n",
"\n",
" # Take the target value at the next time step (last column)\n",
" y.append(values[i+steps, -1])\n",
"\n",
" return np.array(X), np.array(y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6714067e",
"metadata": {},
"outputs": [],
"source": [
" \"\"\"\n",
" Splits your sequence data into train, validation, and test sets.\n",
"\n",
" Inputs:\n",
" - X, y: arrays from make_lstm_sequences()\n",
" - train_ratio: % of data to use for training (default 70%)\n",
" - val_ratio: % of data to use for validation (default 15%)\n",
" → the rest (15%) automatically becomes test data.\n",
"\n",
" Outputs:\n",
" - X_train, y_train\n",
" - X_val, y_val\n",
" - X_test, y_test\n",
" \"\"\"\n",
" \n",
"def split_lstm_data(X, y, train_ratio=0.7, val_ratio=0.15):\n",
"\n",
"\n",
" n = len(X)\n",
" train_end = int(n * train_ratio)\n",
" val_end = int(n * (train_ratio + val_ratio))\n",
"\n",
" X_train, y_train = X[:train_end], y[:train_end]\n",
" X_val, y_val = X[train_end:val_end], y[train_end:val_end]\n",
" X_test, y_test = X[val_end:], y[val_end:]\n",
"\n",
" return X_train, y_train, X_val, y_val, X_test, y_test"
]
},
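{
"cell_type": "markdown",
"id": "5b3c4d5e6f7a8b92",
"metadata": {},
"source": [
"A sketch of how the two helpers above might be wired together. `df_model` comes from the preprocessing sketch, and the window length of 24 rows is an arbitrary assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c4d5e6f7a8b9ca3",
"metadata": {},
"outputs": [],
"source": [
"STEPS = 24  # assumed window length: use the previous 24 rows per sample\n",
"\n",
"# Assumes the rows of df_model are already in chronological order.\n",
"X, y = make_lstm_sequences(df_model, steps=STEPS)\n",
"X_train, y_train, X_val, y_val, X_test, y_test = split_lstm_data(X, y)\n",
"\n",
"print(X_train.shape, X_val.shape, X_test.shape)"
]
},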
{
"cell_type": "markdown",
"id": "8883303e2711cfa",
"metadata": {},
"source": [
"3. Model set up\n",
" - Define LSTM architecture\n",
" - Initialize model, optimizer, and loss function\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6e20ffcba1ebc7c",
"metadata": {},
"outputs": [],
"source": []
},
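{
"cell_type": "markdown",
"id": "7d5e6f7a8b9cadb4",
"metadata": {},
"source": [
"A minimal Keras sketch of the architecture outlined above: one LSTM layer feeding a small dense head that regresses the delay in minutes. The layer sizes, learning rate, and MSE/MAE choices are assumed defaults rather than final settings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e6f7a8b9cadbec5",
"metadata": {},
"outputs": [],
"source": [
"n_features = X_train.shape[2]  # feature count from the sequences built above\n",
"\n",
"model = tf.keras.Sequential([\n",
"    tf.keras.layers.Input(shape=(STEPS, n_features)),\n",
"    tf.keras.layers.LSTM(64),                        # assumed hidden size\n",
"    tf.keras.layers.Dense(32, activation=\"relu\"),\n",
"    tf.keras.layers.Dense(1),                        # predicted delay (min)\n",
"])\n",
"\n",
"model.compile(\n",
"    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),  # assumed LR\n",
"    loss=\"mse\",\n",
"    metrics=[\"mae\"],\n",
")\n",
"model.summary()"
]
},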
{
"cell_type": "markdown",
"id": "40be92eea5ae33e9",
"metadata": {},
"source": [
"4. Training Loop\n",
" - Train model over epochs\n",
" - Track loss and accuracy\n",
" - Save best model (optional)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85429d2f7577e9b5",
"metadata": {},
"outputs": [],
"source": []
},
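{
"cell_type": "markdown",
"id": "9f7a8b9cadbecfd6",
"metadata": {},
"source": [
"A hedged training sketch: `model.fit` runs the epoch loop and tracks loss/MAE on the training and validation sets, while a `ModelCheckpoint` callback keeps the best weights by validation loss. The epoch count, batch size, and checkpoint filename are placeholder assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af8b9cadbecfd0e7",
"metadata": {},
"outputs": [],
"source": [
"# Save the best model seen on validation data (filename is an assumption).\n",
"checkpoint = tf.keras.callbacks.ModelCheckpoint(\n",
"    \"best_stinger_lstm.keras\", monitor=\"val_loss\", save_best_only=True\n",
")\n",
"\n",
"history = model.fit(\n",
"    X_train, y_train,\n",
"    validation_data=(X_val, y_val),\n",
"    epochs=50,        # assumed\n",
"    batch_size=32,    # assumed\n",
"    callbacks=[checkpoint],\n",
")"
]
},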
{
"cell_type": "markdown",
"id": "4f0e8f82bc3d5508",
"metadata": {},
"source": [
"5. Evaluation & Results\n",
" - Evaluate on test set\n",
" - Plot training/validation loss curves\n",
" - Visualize predicted vs actual delays\n"
]
},
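{
"cell_type": "markdown",
"id": "b09cadbecfd0e1f8",
"metadata": {},
"source": [
"A sketch of the evaluation steps listed above, assuming matplotlib is available: report test MSE/MAE, plot the training/validation loss curves from `history`, and compare predicted vs. actual delays on the test set."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1adbecfd0e2f309",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt  # assumed extra dependency\n",
"\n",
"# Test-set performance.\n",
"test_mse, test_mae = model.evaluate(X_test, y_test, verbose=0)\n",
"print(f\"Test MSE: {test_mse:.3f}  Test MAE: {test_mae:.3f}\")\n",
"\n",
"# Training vs. validation loss curves.\n",
"plt.plot(history.history[\"loss\"], label=\"train loss\")\n",
"plt.plot(history.history[\"val_loss\"], label=\"val loss\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"MSE\")\n",
"plt.legend()\n",
"plt.show()\n",
"\n",
"# Predicted vs. actual delays on the test set.\n",
"y_pred = model.predict(X_test).ravel()\n",
"plt.plot(y_test, label=\"actual delay (min)\")\n",
"plt.plot(y_pred, label=\"predicted delay (min)\")\n",
"plt.xlabel(\"Test sample\")\n",
"plt.ylabel(\"Delay (min)\")\n",
"plt.legend()\n",
"plt.show()"
]
}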
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}