diff --git a/.gitignore b/.gitignore index f21726c76..ef705bca9 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ npm-debug.log* yarn-debug.log* yarn-error.log* + +.idea diff --git a/Analysis/.ipynb_checkpoints/Stinger_LSTM-checkpoint.ipynb b/Analysis/.ipynb_checkpoints/Stinger_LSTM-checkpoint.ipynb new file mode 100644 index 000000000..e3ff40a09 --- /dev/null +++ b/Analysis/.ipynb_checkpoints/Stinger_LSTM-checkpoint.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8a77807f92f26ee", + "metadata": {}, + "source": [ + "1. Setup + Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f0f09687eb0be6fa", + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-01T22:47:31.485735Z", + "start_time": "2025-11-01T22:47:31.481782Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "8a4683df52874161", + "metadata": {}, + "source": [ + "2. Data Preparation\n", + " - loading dataset\n", + " - preprocessing\n", + " - creating sequences\n", + " - splitting data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac5ec41278b7be4e", + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-01T22:58:20.940274Z", + "start_time": "2025-11-01T22:58:20.891444Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2878, 7)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distance Between Stations (km)Weather ConditionsDay of the WeekTime of DayTrain TypeHistorical Delay (min)Route Congestion
0100ClearMondayMorningExpress5Low
1150RainyTuesdayAfternoonSuperfast10Medium
2200FoggyWednesdayEveningLocal15High
350ClearThursdayNightExpress2Low
475RainyFridayMorningSuperfast8Medium
\n", + "
" + ], + "text/plain": [ + " Distance Between Stations (km) Weather Conditions Day of the Week \\\n", + "0 100 Clear Monday \n", + "1 150 Rainy Tuesday \n", + "2 200 Foggy Wednesday \n", + "3 50 Clear Thursday \n", + "4 75 Rainy Friday \n", + "\n", + " Time of Day Train Type Historical Delay (min) Route Congestion \n", + "0 Morning Express 5 Low \n", + "1 Afternoon Superfast 10 Medium \n", + "2 Evening Local 15 High \n", + "3 Night Express 2 Low \n", + "4 Morning Superfast 8 Medium " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"data/train_delay_data.csv\")\n", + "print(df.shape)\n", + "df.head()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac335af4", + "metadata": {}, + "outputs": [], + "source": [ + " \"\"\"\n", + " Turns your DataFrame into LSTM-ready sequences.\n", + "\n", + " Inputs:\n", + " - df: your pandas DataFrame.\n", + " ⚠️ The last column should be the target you want to predict.\n", + " All other columns are treated as input features.\n", + " Make sure df is sorted by time (oldest → newest).\n", + " - steps: how many past rows (time steps) to include in each sequence.\n", + " e.g., steps = 24 uses the last 24 hours/days/etc. to predict the next.\n", + "\n", + " Outputs:\n", + " - X: NumPy array of shape (num_samples, steps, num_features)\n", + " → the rolling windows of input features.\n", + " - y: NumPy array of shape (num_samples,)\n", + " → the corresponding target values.\n", + " \"\"\"\n", + "\n", + "\n", + "def make_lstm_sequences(df, steps):\n", + " X, y = [], []\n", + " values = df.values # convert DataFrame to NumPy array\n", + "\n", + " for i in range(len(values) - steps):\n", + " # Take the past `steps` rows of all columns except the last (features)\n", + " X.append(values[i:i+steps, :-1])\n", + "\n", + " # Take the target value at the next time step (last column)\n", + " y.append(values[i+steps, -1])\n", + "\n", + " return np.array(X), np.array(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6714067e", + "metadata": {}, + "outputs": [], + "source": [ + " \"\"\"\n", + " Splits your sequence data into train, validation, and test sets.\n", + "\n", + " Inputs:\n", + " - X, y: arrays from make_lstm_sequences()\n", + " - train_ratio: % of data to use for training (default 70%)\n", + " - val_ratio: % of data to use for validation (default 15%)\n", + " → the rest (15%) automatically becomes test data.\n", + "\n", + " Outputs:\n", + " - X_train, y_train\n", + " - X_val, y_val\n", + " - X_test, y_test\n", + " \"\"\"\n", + " \n", + "def split_lstm_data(X, y, train_ratio=0.7, val_ratio=0.15):\n", + "\n", + "\n", + " n = len(X)\n", + " train_end = int(n * train_ratio)\n", + " val_end = int(n * (train_ratio + val_ratio))\n", + "\n", + " X_train, y_train = X[:train_end], y[:train_end]\n", + " X_val, y_val = X[train_end:val_end], y[train_end:val_end]\n", + " X_test, y_test = X[val_end:], y[val_end:]\n", + "\n", + " return X_train, y_train, X_val, y_val, X_test, y_test" + ] + }, + { + "cell_type": "markdown", + "id": "8883303e2711cfa", + "metadata": {}, + "source": [ + "3. Model set up\n", + " - Define LSTM architecture\n", + " - Initialize model, optimizer, and loss function\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6e20ffcba1ebc7c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "40be92eea5ae33e9", + "metadata": {}, + "source": [ + "4. 
Training Loop\n", + " - Train model over epochs\n", + " - Track loss and accuracy\n", + " - Save best model (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85429d2f7577e9b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "4f0e8f82bc3d5508", + "metadata": {}, + "source": [ + "5. Evaluation & Results\n", + " - Evaluate on test set\n", + " - Plot training/validation loss curves\n", + " - Visualize predicted vs actual delays\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Analysis/Stinger_LSTM.ipynb b/Analysis/Stinger_LSTM.ipynb index b14d4bc12..e3ff40a09 100644 --- a/Analysis/Stinger_LSTM.ipynb +++ b/Analysis/Stinger_LSTM.ipynb @@ -4,10 +4,13 @@ "cell_type": "markdown", "id": "8a77807f92f26ee", "metadata": {}, - "source": "1. Setup + Imports" + "source": [ + "1. Setup + Imports" + ] }, { "cell_type": "code", + "execution_count": 3, "id": "f0f09687eb0be6fa", "metadata": { "ExecuteTime": { @@ -15,41 +18,36 @@ "start_time": "2025-11-01T22:47:31.481782Z" } }, + "outputs": [], "source": [ "import os\n", "import tensorflow as tf\n", "import pandas as pd\n", "import numpy as np" - ], - "outputs": [], - "execution_count": 3 + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "8a4683df52874161", + "metadata": {}, "source": [ "2. Data Preparation\n", " - loading dataset\n", " - preprocessing\n", " - creating sequences\n", " - splitting data" - ], - "id": "8a4683df52874161" + ] }, { + "cell_type": "code", + "execution_count": 11, + "id": "ac5ec41278b7be4e", "metadata": { "ExecuteTime": { "end_time": "2025-11-01T22:58:20.940274Z", "start_time": "2025-11-01T22:58:20.891444Z" } }, - "cell_type": "code", - "source": [ - "df = pd.read_csv(\"data/train_delay_data.csv\")\n", - "print(df.shape)\n", - "df.head()\n" - ], - "id": "ac5ec41278b7be4e", "outputs": [ { "name": "stdout", @@ -60,21 +58,6 @@ }, { "data": { - "text/plain": [ - " Distance Between Stations (km) Weather Conditions Day of the Week \\\n", - "0 100 Clear Monday \n", - "1 150 Rainy Tuesday \n", - "2 200 Foggy Wednesday \n", - "3 50 Clear Thursday \n", - "4 75 Rainy Friday \n", - "\n", - " Time of Day Train Type Historical Delay (min) Route Congestion \n", - "0 Morning Express 5 Low \n", - "1 Afternoon Superfast 10 Medium \n", - "2 Evening Local 15 High \n", - "3 Night Express 2 Low \n", - "4 Morning Superfast 8 Medium " - ], "text/html": [ "
\n", "