Commit f35594c

Author: Will Johnson
Add example grouping logic in group_events.py
1 parent beb5e4d commit f35594c

8 files changed: +250, -258 lines

.gitignore (+2)

@@ -1,3 +1,5 @@
+data/
+
 # Created by https://www.toptal.com/developers/gitignore/api/macos,python,visualstudiocode,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,visualstudiocode,pycharm

dates.py (-141)

This file was deleted.

python/__init__.py

Whitespace-only changes.

python/generate_events.py (new file, +138)

from dataclasses import (
    asdict,
    dataclass,
)
from datetime import (
    datetime,
    timedelta,
)
import os
import random
from typing import Iterator, List

from matplotlib import pyplot as plt
import pandas as pd


NUM_USERS = 4

WEBSITES = ["Cat Pictures", "Stack Overflow", "PyData", "Reddit"]
NUM_WEBSITES = len(WEBSITES)

COLOURS = [
    "#f98c2a",  # Orange
    "#5ddbc2",  # Teal
    "#5c5aa6",  # Dark purple
    "#d8d7d6",  # Darker grey
    "#8c8bc0",  # Light purple
    "#e2e2e3",  # Grey
]

random.seed("PyData!")


@dataclass
class Event:
    user_id: int
    website_id: int
    event_datetime: datetime


Events = List[Event]


def random_dates(start: datetime, n: int) -> Iterator[datetime]:
    """Yield n timestamps after start, separated by small random gaps."""
    current = start
    for _ in range(n):
        # Add a gap of up to 5 minutes
        extra_minutes = random.randint(0, 5)

        # 1/10 of the time, add a bigger gap of up to an hour
        if random.random() < 0.1:
            extra_minutes += random.randint(10, 60)

        current += timedelta(minutes=extra_minutes)
        current += timedelta(seconds=random.randint(0, 59))

        yield current


def random_events() -> pd.DataFrame:
    """Generate a random browsing history for each user."""
    events: Events = []

    for cust_id in range(NUM_USERS):
        num_rows = random.randint(20, 40)
        website_ids = list(range(NUM_WEBSITES))
        for dt in random_dates(datetime(2022, 10, 1, 8), num_rows):
            event = Event(
                user_id=cust_id,
                # Reversed ids act as weights, so lower website ids are
                # sampled more often
                website_id=random.choices(
                    population=website_ids, weights=website_ids[::-1]
                )[0],
                event_datetime=dt,
            )
            events.append(event)

    return pd.DataFrame.from_records([asdict(e) for e in events])


def plot_events(events_df: pd.DataFrame):
    # Different colours for each website:
    # create an axis with the first website, then append to that axis afterwards
    cust_df = events_df[events_df["website_id"] == 0]
    axis = cust_df.plot(
        x="event_datetime",
        y="user_id",
        kind="scatter",
        c=COLOURS[0],
        label=WEBSITES[0],
    )

    for website_id in range(1, NUM_WEBSITES):
        cust_df = events_df[events_df["website_id"] == website_id]
        cust_df.plot(
            x="event_datetime",
            y="user_id",
            kind="scatter",
            ax=axis,
            c=COLOURS[website_id],
            label=WEBSITES[website_id],
        )

    # Tick every two hours between the first and last event
    min_date = (
        events_df["event_datetime"].min().replace(microsecond=0, second=0, minute=0)
    )
    max_date = events_df["event_datetime"].max().replace(
        microsecond=0, second=0, minute=0
    ) + timedelta(hours=1)

    date_to_add = min_date
    dates = [date_to_add]
    while date_to_add <= max_date:
        date_to_add += timedelta(hours=2)
        dates.append(date_to_add)

    plt.yticks(range(NUM_USERS))
    plt.xticks(dates)

    plt.ylabel("user ID", fontsize=8)
    plt.xlabel("Time", fontsize=8)

    plt.tick_params(axis="x", which="major", labelsize=8)
    plt.tick_params(axis="y", which="major", labelsize=8)

    plt.show()


if __name__ == "__main__":
    events = random_events()

    base_dir = os.path.dirname(os.path.dirname(__file__))
    filename = f"website_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    events.to_csv(
        os.path.join(base_dir, "data", filename),
        index=False,
        date_format="%Y-%m-%dT%H:%M:%S",
    )

    plot_events(events)
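
For context, a minimal sketch (not part of the commit) of how a generated CSV could be read back and inspected with pandas. The filename below is hypothetical, since the script timestamps the real one; parse_dates is used to show that event_datetime round-trips through the %Y-%m-%dT%H:%M:%S format written above.

import pandas as pd

# Hypothetical example filename; substitute the timestamped file the script wrote.
df = pd.read_csv(
    "data/website_data_20221001_080000.csv",
    parse_dates=["event_datetime"],
)

# Events per user and per website, plus the overall time span covered.
print(df.groupby(["user_id", "website_id"]).size())
print(df["event_datetime"].min(), "->", df["event_datetime"].max())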

python/group_events.py (new file, +107)

from collections.abc import Iterator
import csv
from datetime import datetime, timedelta
import hashlib
import os

from matplotlib import pyplot as plt
import pandas as pd


SESSION_TIMEOUT_SECS = 1800

CSV_FILE = "website_data_20230418_012558.csv"

base_dir = os.path.dirname(os.path.dirname(__file__))
file_path = os.path.join(base_dir, "data", CSV_FILE)


def read_events() -> list[dict]:
    with open(file_path) as f:
        csv_reader = csv.DictReader(f)
        return list(csv_reader)


def sequential_grouping(events: list[dict]) -> Iterator[dict]:
    """Assign a session_id to each event, assuming events are ordered by user and time."""
    current_user = None
    for event in events:
        current_website = event["website_id"]
        current_time = datetime.fromisoformat(event["event_datetime"])

        # A new user resets the in-memory session state
        if current_user != event["user_id"]:
            current_user = event["user_id"]
            sessions = {}

        # Is this a new website or has the session timed out?
        if current_website not in sessions or current_time >= sessions.get(
            current_website
        ).get("max_session_time"):
            hash_string = f"{current_user}-{current_website}-{event['event_datetime']}"
            # create a new session in the 'memory'
            sessions[current_website] = {
                "session_id": hashlib.md5(hash_string.encode("utf-8")).hexdigest(),
            }

        # update the max_session_time for the session
        sessions[current_website]["max_session_time"] = current_time + timedelta(
            seconds=SESSION_TIMEOUT_SECS
        )

        # Output the row with session information
        session_id = sessions.get(current_website).get("session_id")
        yield {
            "user_id": current_user,
            "website_id": current_website,
            "event_datetime": datetime.fromisoformat(event["event_datetime"]),
            "session_id": session_id,
        }


def plot_events(events_df: pd.DataFrame):
    # Different colours for each session_id:
    # get all the session_ids
    session_ids = events_df["session_id"].unique()

    # create an axis with the first session, then append to that axis afterwards
    cust_df = events_df[events_df["session_id"] == session_ids[0]]
    axis = cust_df.plot(
        x="event_datetime",
        y="user_id",
        kind="scatter",
        c=f"#{session_ids[0][:6]}",
        label=session_ids[0][:6],
    )

    for session_id in session_ids[1:]:
        cust_df = events_df[events_df["session_id"] == session_id]
        cust_df.plot(
            x="event_datetime",
            y="user_id",
            kind="scatter",
            ax=axis,
            c=f"#{session_id[:6]}",
            label=session_id[:6],
        )

    num_users = events_df["user_id"].unique().size

    plt.yticks(range(num_users))

    plt.gcf().autofmt_xdate()

    plt.ylabel("user ID", fontsize=8)
    plt.xlabel("Time", fontsize=8)

    plt.tick_params(axis="x", which="major", labelsize=8)
    plt.tick_params(axis="y", which="major", labelsize=8)

    axis.get_legend().remove()

    plt.show()


if __name__ == "__main__":
    events = read_events()

    events_df = pd.DataFrame.from_records(sequential_grouping(events))

    plot_events(events_df)
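
To illustrate the timeout rule, a minimal sketch (not part of the commit) that runs sequential_grouping() over three hand-written rows shaped like DictReader output. The user, website, and timestamps are made up; the gap before the last row is chosen to exceed SESSION_TIMEOUT_SECS (30 minutes), so it should receive a new session_id.

# Three events for one user on one website; the third starts 80 minutes
# after the second, well past the 30-minute timeout.
sample_events = [
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T08:00:00"},
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T08:10:00"},
    {"user_id": "0", "website_id": "1", "event_datetime": "2022-10-01T09:30:00"},
]

for row in sequential_grouping(sample_events):
    print(row["event_datetime"], row["session_id"][:8])
# Expected: the first two rows share a session_id; the third gets a new one.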
