Feat/refactor create time series (#2)
L-M-Sherlock authored Jul 29, 2023
1 parent 4763435 commit 420a10f
Showing 3 changed files with 74 additions and 69 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "FSRS-Optimizer"
version = "4.4.2"
version = "4.5.0"
readme = "README.md"
dependencies = [
"matplotlib>=3.7.0",
8 changes: 5 additions & 3 deletions src/fsrs_optimizer/__main__.py
@@ -61,12 +61,14 @@ def remembered_fallback_prompt(key: str, pretty: str = None):
show_graphs = graphs_input != "n"

optimizer = fsrs_optimizer.Optimizer()
optimizer.anki_extract(filename)
optimizer.anki_extract(
filename,
remembered_fallbacks["filter_out_suspended_cards"] == "y"
)
analysis = optimizer.create_time_series(
remembered_fallbacks["timezone"],
remembered_fallbacks["revlog_start_date"],
remembered_fallbacks["next_day"],
remembered_fallbacks["filter_out_suspended_cards"] == "y"
remembered_fallbacks["next_day"]
)
print(analysis)

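For orientation, here is a minimal sketch of the calling flow after this refactor: the suspended-card filter now travels with anki_extract (which extracts the deck and writes revlog.csv), and create_time_series only takes the timezone, the revlog start date, and the day-rollover hour. The file name and argument values below are illustrative, not taken from the commit.

import fsrs_optimizer

optimizer = fsrs_optimizer.Optimizer()

# Step 1: extract the deck and write revlog.csv; the suspended-card
# filter is now passed here rather than to create_time_series.
optimizer.anki_extract("collection.colpkg", filter_out_suspended_cards=True)

# Step 2: build the time series from revlog.csv (illustrative arguments).
analysis = optimizer.create_time_series("Asia/Shanghai", "2006-10-05", 4)
print(analysis)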
133 changes: 68 additions & 65 deletions src/fsrs_optimizer/fsrs_optimizer.py
@@ -23,6 +23,11 @@
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

New = 0
Learning = 1
Review = 2
Relearning = 3

def power_forgetting_curve(t, s):
return (1 + t / (9 * s)) ** -1
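A quick worked check of the curve above: at t = s the expression (1 + t/(9s))^-1 evaluates to 9/10, so stability can be read as the interval at which predicted retention drops to 90%. A minimal sketch:

def power_forgetting_curve(t, s):
    return (1 + t / (9 * s)) ** -1

# At t == s: (1 + 1/9) ** -1 == 0.9, i.e. 90% predicted retention.
assert abs(power_forgetting_curve(10, 10) - 0.9) < 1e-9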

@@ -310,16 +315,13 @@ class Optimizer:
def __init__(self) -> None:
tqdm.pandas()

@staticmethod
def anki_extract(filename: str):
def anki_extract(self, filename: str, filter_out_suspended_cards: bool = False):
"""Step 1"""
# Extract the collection file or deck file to get the .anki21 database.
with zipfile.ZipFile(f'{filename}', 'r') as zip_ref:
zip_ref.extractall('./')
tqdm.write("Deck file extracted successfully!")

def create_time_series(self, timezone: str, revlog_start_date: str, next_day_starts_at: int, filter_out_suspended_cards: bool = False):
"""Step 2"""
if os.path.isfile("collection.anki21b"):
os.remove("collection.anki21b")
raise Exception(
@@ -346,60 +348,61 @@ def create_time_series(self, timezone: str, revlog_start_date: str, next_day_sta
if len(revlog) == 0:
raise Exception("No review log found!")
df = pd.DataFrame(revlog)
df.columns = ['id', 'cid', 'usn', 'r', 'ivl', 'last_ivl', 'factor', 'time', 'type']
df = df[(df['cid'] <= time.time() * 1000) &
(df['id'] <= time.time() * 1000)].copy()

df_set_due_date = df[(df['type'] == 4) & (df['ivl'] > 0)]
df.columns = ['review_time', 'card_id', 'usn', 'review_rating', 'ivl', 'last_ivl', 'factor', 'review_duration', 'review_state']
df = df[(df['card_id'] <= time.time() * 1000) &
(df['review_time'] <= time.time() * 1000)].copy()
df_set_due_date = df[(df['review_state'] == 4) & (df['ivl'] > 0)]
df.drop(df_set_due_date.index, inplace=True)
df.sort_values(by=['card_id', 'review_time'], inplace=True, ignore_index=True)

df['create_date'] = pd.to_datetime(df['cid'] // 1000, unit='s')
df['create_date'] = df['create_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df['review_date'] = pd.to_datetime(df['id'] // 1000, unit='s')
df['review_date'] = df['review_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df.drop(df[df['review_date'].dt.year < 2006].index, inplace=True)
df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)

df['is_learn_start'] = (df['type'] == 0) & (df['type'].shift() != 0)
df['is_learn_start'] = (df['review_state'] == 0) & (df['review_state'].shift() != 0)
df['sequence_group'] = df['is_learn_start'].cumsum()
last_learn_start = df[df['is_learn_start']].groupby('cid')['sequence_group'].last()
df['last_learn_start'] = df['cid'].map(last_learn_start).fillna(0).astype(int)
last_learn_start = df[df['is_learn_start']].groupby('card_id')['sequence_group'].last()
df['last_learn_start'] = df['card_id'].map(last_learn_start).fillna(0).astype(int)
df['mask'] = df['last_learn_start'] <= df['sequence_group']
df = df[df['mask'] == True].copy()
df.drop(columns=['is_learn_start', 'sequence_group', 'last_learn_start', 'mask'], inplace=True)
df = df[(df['type'] != 4)].copy()

self.type_sequence = np.array(df['type'])
self.time_sequence = np.array(df['time'])
df = df[(df['review_state'] != 4)].copy()
df = df[(df['review_state'] != 3) | (df['factor'] != 0)].copy()
df['review_state'] = df['review_state'] + 1
df.loc[df['is_learn_start'], 'review_state'] = New
df.drop(columns=['is_learn_start', 'sequence_group', 'last_learn_start', 'mask', 'usn', 'ivl', 'last_ivl', 'factor'], inplace=True)
df.to_csv("revlog.csv", index=False)
tqdm.write("revlog.csv saved.")

df = df[(df['type'] != 3) | (df['factor'] != 0)].copy()
def create_time_series(self, timezone: str, revlog_start_date: str, next_day_starts_at: int):
"""Step 2"""
df = pd.read_csv("./revlog.csv")
df['review_state'] = df['review_state'].map(lambda x: x if x != New else Learning)
self.state_sequence = np.array(df['review_state'])
self.duration_sequence = np.array(df['review_duration'])
df['review_date'] = pd.to_datetime(df['review_time'] // 1000, unit='s')
df['review_date'] = df['review_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df.drop(df[df['review_date'].dt.year < 2006].index, inplace=True)
df['real_days'] = df['review_date'] - timedelta(hours=int(next_day_starts_at))
df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D', ambiguous='infer', nonexistent='shift_forward')).to_julian_date()
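The two lines above implement the day rollover: subtracting next_day_starts_at hours before flooring to a whole day makes late-night reviews count toward the previous day. A toy check with illustrative values (assuming next_day_starts_at = 4):

import pandas as pd
from datetime import timedelta

next_day_starts_at = 4  # illustrative value
review = pd.Timestamp("2023-07-29 03:30:00")
# A 03:30 review shifted back 4 hours lands on July 28, so it is floored
# to the previous day before delta_t is computed.
print((review - timedelta(hours=next_day_starts_at)).floor("D"))  # 2023-07-28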
df.drop_duplicates(['cid', 'real_days'], keep='first', inplace=True)
df.drop_duplicates(['card_id', 'real_days'], keep='first', inplace=True)
df['delta_t'] = df.real_days.diff()
df.dropna(inplace=True)
df['i'] = df.groupby('cid').cumcount() + 1
df['i'] = df.groupby('card_id').cumcount() + 1
df.loc[df['i'] == 1, 'delta_t'] = 0
df = df.groupby('cid').filter(lambda group: group['type'].iloc[0] == 0)
df['prev_type'] = df.groupby('cid')['type'].shift(1).fillna(0).astype(int)
df['helper'] = ((df['type'] == 0) & ((df['prev_type'] == 1) | (df['prev_type'] == 2)) & (df['i'] > 1)).astype(int)
df['helper'] = df.groupby('cid')['helper'].cumsum()
df = df.groupby('card_id').filter(lambda group: group['review_state'].iloc[0] == Learning)
df['prev_review_state'] = df.groupby('card_id')['review_state'].shift(1).fillna(Learning).astype(int)
df['helper'] = ((df['review_state'] == Learning) & ((df['prev_review_state'] == Review) | (df['prev_review_state'] == Relearning)) & (df['i'] > 1)).astype(int)
df['helper'] = df.groupby('card_id')['helper'].cumsum()
df = df[df['helper'] == 0]
del df['prev_type']
del df['prev_review_state']
del df['helper']

def cum_concat(x):
return list(accumulate(x))

t_history = df.groupby('cid', group_keys=False)['delta_t'].apply(lambda x: cum_concat([[int(i)] for i in x]))
t_history = df.groupby('card_id', group_keys=False)['delta_t'].apply(lambda x: cum_concat([[int(i)] for i in x]))
df['t_history']=[','.join(map(str, item[:-1])) for sublist in t_history for item in sublist]
r_history = df.groupby('cid', group_keys=False)['r'].apply(lambda x: cum_concat([[i] for i in x]))
r_history = df.groupby('card_id', group_keys=False)['review_rating'].apply(lambda x: cum_concat([[i] for i in x]))
df['r_history']=[','.join(map(str, item[:-1])) for sublist in r_history for item in sublist]
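The accumulate-based helper above builds, for every review, the comma-joined history of the intervals (and ratings) that preceded it. A toy walk-through with illustrative values for a single card:

from itertools import accumulate

def cum_concat(x):
    return list(accumulate(x))  # accumulate concatenates the singleton lists

delta_t = [0, 3, 7]  # illustrative per-review intervals for one card
acc = cum_concat([[int(i)] for i in delta_t])   # [[0], [0, 3], [0, 3, 7]]
t_history = [','.join(map(str, item[:-1])) for item in acc]
print(t_history)  # ['', '0', '0,3'] -- each row sees only the earlier intervals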
df = df.groupby('cid').filter(lambda group: group['id'].min() > time.mktime(datetime.strptime(revlog_start_date, "%Y-%m-%d").timetuple()) * 1000)
df = df[df['r'] != 0].copy()
df['y'] = df['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x])
df = df.groupby('card_id').filter(lambda group: group['review_time'].min() > time.mktime(datetime.strptime(revlog_start_date, "%Y-%m-%d").timetuple()) * 1000)
df = df[df['review_rating'] != 0].copy()
df['y'] = df['review_rating'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x])

def remove_outliers(group: pd.DataFrame) -> pd.DataFrame:
# threshold = np.mean(group['delta_t']) * 1.5
@@ -421,7 +424,7 @@ def remove_non_continuous_rows(group):
first_non_continuous_index = discontinuity.idxmax()
return group.loc[:first_non_continuous_index-1]

df = df.groupby('cid', as_index=False, group_keys=False).progress_apply(remove_non_continuous_rows)
df = df.groupby('card_id', as_index=False, group_keys=False).progress_apply(remove_non_continuous_rows)

df.to_csv('revlog_history.tsv', sep="\t", index=False)
tqdm.write("Trainset saved.")
@@ -431,10 +434,10 @@ def remove_non_continuous_rows(group):
self.S0_dataset_group.to_csv('stability_for_pretrain.tsv', sep='\t', index=None)

df['retention'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['y'].transform('mean')
df['total_cnt'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['id'].transform('count')
df['total_cnt'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['review_time'].transform('count')
tqdm.write("Retention calculated.")

df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_ivl', 'factor', 'time', 'type', 'create_date', 'review_date', 'real_days', 'r', 't_history', 'y'])
df.drop(columns=['review_time', 'card_id', 'review_duration', 'review_state', 'review_date', 'real_days', 'review_rating', 't_history', 'y'], inplace=True)
df.drop_duplicates(inplace=True)
df['retention'] = df['retention'].map(lambda x: max(min(0.99, x), 0.01))

@@ -616,7 +619,7 @@ def train(self, lr: float = 4e-2, n_epoch: int = 5, n_splits: int = 5, batch_siz
tqdm.write("\nTraining finished!")
return plots

def preview(self, requestRetention: float):
def preview(self, requestRetention: float, verbose=False):
my_collection = Collection(self.w)
preview_text = "1:again, 2:hard, 3:good, 4:easy\n"
for first_rating in (1,2,3,4):
@@ -627,8 +630,8 @@ def preview(self, requestRetention: float):
# print("stability, difficulty, lapses")
for i in range(10):
states = my_collection.predict(t_history, r_history)
# print('{0:9.2f} {1:11.2f} {2:7.0f}'.format(
# *list(map(lambda x: round(float(x), 4), states))))
if verbose:
print('{0:9.2f} {1:11.2f} {2:7.0f}'.format(*list(map(lambda x: round(float(x), 4), states))))
next_t = next_interval(states[0], requestRetention)
difficulty = round(float(states[1]), 1)
t_history += f',{int(next_t)}'
@@ -664,10 +667,10 @@ def predict_memory_states(self):
difficulties = map(lambda x: round(x, 2), difficulties)
self.dataset['stability'] = list(stabilities)
self.dataset['difficulty'] = list(difficulties)
prediction = self.dataset.groupby(by=['t_history', 'r_history']).agg({"stability": "mean", "difficulty": "mean", "id": "count"})
prediction = self.dataset.groupby(by=['t_history', 'r_history']).agg({"stability": "mean", "difficulty": "mean", "review_time": "count"})
prediction.reset_index(inplace=True)
prediction.sort_values(by=['r_history'], inplace=True)
prediction.rename(columns={"id": "count"}, inplace=True)
prediction.rename(columns={"review_time": "count"}, inplace=True)
prediction.to_csv("./prediction.tsv", sep='\t', index=None)
tqdm.write("prediction.tsv saved.")
prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
@@ -690,24 +693,24 @@ def find_optimal_retention(self):
f_time = 25
max_time = 1e10

type_block = dict()
type_count = dict()
type_time = dict()
last_t = self.type_sequence[0]
type_block[last_t] = 1
type_count[last_t] = 1
type_time[last_t] = self.time_sequence[0]
for i,t in enumerate(self.type_sequence[1:]):
type_count[t] = type_count.setdefault(t, 0) + 1
type_time[t] = type_time.setdefault(t, 0) + self.time_sequence[i]
if t != last_t:
type_block[t] = type_block.setdefault(t, 0) + 1
last_t = t

r_time = round(type_time[1]/type_count[1]/1000, 1)

if 2 in type_count and 2 in type_block:
f_time = round(type_time[2]/type_block[2]/1000 + r_time, 1)
state_block = dict()
state_count = dict()
state_duration = dict()
last_state = self.state_sequence[0]
state_block[last_state] = 1
state_count[last_state] = 1
state_duration[last_state] = self.duration_sequence[0]
for i, state in enumerate(self.state_sequence[1:]):
state_count[state] = state_count.setdefault(state, 0) + 1
state_duration[state] = state_duration.setdefault(state, 0) + self.duration_sequence[i]
if state != last_state:
state_block[state] = state_block.setdefault(state, 0) + 1
last_state = state

r_time = round(state_duration[Review]/state_count[Review]/1000, 1)

if Relearning in state_count and Relearning in state_block:
f_time = round(state_duration[Relearning]/state_block[Relearning]/1000 + r_time, 1)

tqdm.write(f"average time for failed cards: {f_time}s")
tqdm.write(f"average time for recalled cards: {r_time}s")
@@ -806,8 +809,8 @@ def evaluate(self):
tmp['difficulty'] = tmp['difficulty'].map(lambda x: round(x, 2))
tmp['p'] = tmp['p'].map(lambda x: round(x, 2))
tmp['log_loss'] = tmp['log_loss'].map(lambda x: round(x, 2))
tmp.rename(columns={"r": "grade", "p": "retrievability"}, inplace=True)
tmp[['id', 'cid', 'review_date', 'r_history', 't_history', 'delta_t', 'grade', 'stability', 'difficulty', 'retrievability', 'log_loss']].to_csv("./evaluation.tsv", sep='\t', index=False)
tmp.rename(columns={"p": "retrievability"}, inplace=True)
tmp[['review_time', 'card_id', 'review_date', 'r_history', 't_history', 'delta_t', 'review_rating', 'stability', 'difficulty', 'retrievability', 'log_loss']].to_csv("./evaluation.tsv", sep='\t', index=False)
del tmp
return loss_before, loss_after

