Commit f35594c

Author: Will Johnson
Add example grouping logic in group_events.py
1 parent beb5e4d commit f35594c

8 files changed: +250, -258 lines

.gitignore (+2)

@@ -1,3 +1,5 @@
+data/
+
 # Created by https://www.toptal.com/developers/gitignore/api/macos,python,visualstudiocode,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,visualstudiocode,pycharm

dates.py (-141)

This file was deleted.

python/__init__.py

Whitespace-only changes.

python/generate_events.py (new file, +138)

from dataclasses import (
    asdict,
    dataclass,
)
from datetime import (
    datetime,
    timedelta,
)
import os
import random
from typing import Iterator, List

from matplotlib import pyplot as plt
import pandas as pd


NUM_USERS = 4

WEBSITES = ["Cat Pictures", "Stack Overflow", "PyData", "Reddit"]
NUM_WEBSITES = len(WEBSITES)

COLOURS = [
    "#f98c2a",  # Orange
    "#5ddbc2",  # Teal
    "#5c5aa6",  # Dark purple
    "#d8d7d6",  # Darker grey
    "#8c8bc0",  # Light purple
    "#e2e2e3",  # Grey
]

random.seed("PyData!")


@dataclass
class Event:
    user_id: int
    website_id: int
    event_datetime: datetime


Events = List[Event]


def random_dates(start: datetime, n: int) -> Iterator[datetime]:
    """Yield n timestamps after start, separated by small random gaps."""
    current = start
    for _ in range(n):
        # Add a gap of up to 5 minutes
        extra_minutes = random.randint(0, 5)

        # 1/10 of the time, add a bigger gap of up to an hour
        if random.random() < 0.1:
            extra_minutes += random.randint(10, 60)

        current += timedelta(minutes=extra_minutes)
        current += timedelta(seconds=random.randint(0, 59))

        yield current


def random_events() -> pd.DataFrame:
    """Generate a random browsing history for each user."""
    events: Events = []

    for cust_id in range(NUM_USERS):
        num_rows = random.randint(20, 40)
        website_ids = list(range(NUM_WEBSITES))
        for dt in random_dates(datetime(2022, 10, 1, 8), num_rows):
            event = Event(
                user_id=cust_id,
                # Reversed ids act as weights, so lower website ids are
                # sampled more often
                website_id=random.choices(
                    population=website_ids, weights=website_ids[::-1]
                )[0],
                event_datetime=dt,
            )
            events.append(event)

    return pd.DataFrame.from_records([asdict(e) for e in events])


def plot_events(events_df: pd.DataFrame):
    # Different colours for each website:
    # create an axis with the first website, then append to that axis afterwards
    cust_df = events_df[events_df["website_id"] == 0]
    axis = cust_df.plot(
        x="event_datetime",
        y="user_id",
        kind="scatter",
        c=COLOURS[0],
        label=WEBSITES[0],
    )

    for website_id in range(1, NUM_WEBSITES):
        cust_df = events_df[events_df["website_id"] == website_id]
        cust_df.plot(
            x="event_datetime",
            y="user_id",
            kind="scatter",
            ax=axis,
            c=COLOURS[website_id],
            label=WEBSITES[website_id],
        )

    # Tick every two hours between the first and last event
    min_date = (
        events_df["event_datetime"].min().replace(microsecond=0, second=0, minute=0)
    )
    max_date = events_df["event_datetime"].max().replace(
        microsecond=0, second=0, minute=0
    ) + timedelta(hours=1)

    date_to_add = min_date
    dates = [date_to_add]
    while date_to_add <= max_date:
        date_to_add += timedelta(hours=2)
        dates.append(date_to_add)

    plt.yticks(range(NUM_USERS))
    plt.xticks(dates)

    plt.ylabel("user ID", fontsize=8)
    plt.xlabel("Time", fontsize=8)

    plt.tick_params(axis="x", which="major", labelsize=8)
    plt.tick_params(axis="y", which="major", labelsize=8)

    plt.show()


if __name__ == "__main__":
    events = random_events()

    base_dir = os.path.dirname(os.path.dirname(__file__))
    filename = f"website_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    events.to_csv(
        os.path.join(base_dir, "data", filename),
        index=False,
        date_format="%Y-%m-%dT%H:%M:%S",
    )

    plot_events(events)
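
For context, a minimal sketch (not part of the commit) of how a generated CSV could be read back and inspected with pandas. The filename below is hypothetical, since the script timestamps the real one; parse_dates is used to show that event_datetime round-trips through the %Y-%m-%dT%H:%M:%S format written above.

import pandas as pd

# Hypothetical example filename; substitute the timestamped file the script wrote.
df = pd.read_csv(
    "data/website_data_20221001_080000.csv",
    parse_dates=["event_datetime"],
)

# Events per user and per website, plus the overall time span covered.
print(df.groupby(["user_id", "website_id"]).size())
print(df["event_datetime"].min(), "->", df["event_datetime"].max())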

python/group_events.py (new file, +107)

from collections.abc import Iterator
import csv
from datetime import datetime, timedelta
import hashlib
import os

from matplotlib import pyplot as plt
import pandas as pd


SESSION_TIMEOUT_SECS = 1800

CSV_FILE = "website_data_20230418_012558.csv"

base_dir = os.path.dirname(os.path.dirname(__file__))
file_path = os.path.join(base_dir, "data", CSV_FILE)


def read_events() -> list[dict]:
    with open(file_path) as f:
        csv_reader = csv.DictReader(f)
        return list(csv_reader)


def sequential_grouping(events: list[dict]) -> Iterator[dict]:
    """Assign a session_id to each event, assuming events are ordered by user and time."""
    current_user = None
    for event in events:
        current_website = event["website_id"]
        current_time = datetime.fromisoformat(event["event_datetime"])

        # A new user resets the in-memory session state
        if current_user != event["user_id"]:
            current_user = event["user_id"]
            sessions = {}

        # Is this a new website or has the session timed out?
        if current_website not in sessions or current_time >= sessions.get(
            current_website
        ).get("max_session_time"):
            hash_string = f"{current_user}-{current_website}-{event['event_datetime']}"
            # create a new session in the 'memory'
            sessions[current_website] = {
                "session_id": hashlib.md5(hash_string.encode("utf-8")).hexdigest(),
            }

        # update the max_session_time for the session
        sessions[current_website]["max_session_time"] = current_time + timedelta(
            seconds=SESSION_TIMEOUT_SECS
        )

        # Output the row with session information
        session_id = sessions.get(current_website).get("session_id")
        yield {
            "user_id": current_user,
            "website_id": current_website,
            "event_datetime": datetime.fromisoformat(event["event_datetime"]),
            "session_id": session_id,
        }


def plot_events(events_df: pd.DataFrame):
    # Different colours for each session_id:
    # get all the session_ids
    session_ids = events_df["session_id"].unique()

    # create an axis with the first session, then append to that axis afterwards
    cust_df = events_df[events_df["session_id"] == session_ids[0]]
    axis = cust_df.plot(
        x="event_datetime",
        y="user_id",
        kind="scatter",
        c=f"#{session_ids[0][:6]}",
        label=session_ids[0][:6],
    )

    for session_id in session_ids[1:]:
        cust_df = events_df[events_df["session_id"] == session_id]
        cust_df.plot(
            x="event_datetime",
            y="user_id",
            kind="scatter",
            ax=axis,
            c=f"#{session_id[:6]}",
            label=session_id[:6],
        )

    num_users = events_df["user_id"].unique().size

    plt.yticks(range(num_users))

    plt.gcf().autofmt_xdate()

    plt.ylabel("user ID", fontsize=8)
    plt.xlabel("Time", fontsize=8)

    plt.tick_params(axis="x", which="major", labelsize=8)
    plt.tick_params(axis="y", which="major", labelsize=8)

    axis.get_legend().remove()

    plt.show()


if __name__ == "__main__":
    events = read_events()

    events_df = pd.DataFrame.from_records(sequential_grouping(events))

    plot_events(events_df)
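
To illustrate the timeout rule, a minimal sketch (not part of the commit) that runs sequential_grouping() over three hand-written rows shaped like DictReader output. The user, website, and timestamps are made up; the gap before the last row is chosen to exceed SESSION_TIMEOUT_SECS (30 minutes), so it should receive a new session_id.

# Three events for one user on one website; the third starts 80 minutes
# after the second, well past the 30-minute timeout.
sample_events = [
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T08:00:00"},
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T08:10:00"},
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T09:30:00"},
]

for row in sequential_grouping(sample_events):
    print(row["event_datetime"], row["session_id"][:8])
# Expected: the first two rows share a session_id; the third gets a new one.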
