Feat/refactor create time series (#2)
L-M-Sherlock authored Jul 29, 2023
1 parent 4763435 commit 420a10f
Showing 3 changed files with 74 additions and 69 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "FSRS-Optimizer"
version = "4.4.2"
version = "4.5.0"
readme = "README.md"
dependencies = [
"matplotlib>=3.7.0",
8 changes: 5 additions & 3 deletions src/fsrs_optimizer/__main__.py
@@ -61,12 +61,14 @@ def remembered_fallback_prompt(key: str, pretty: str = None):
show_graphs = graphs_input != "n"

optimizer = fsrs_optimizer.Optimizer()
optimizer.anki_extract(filename)
optimizer.anki_extract(
filename,
remembered_fallbacks["filter_out_suspended_cards"] == "y"
)
analysis = optimizer.create_time_series(
remembered_fallbacks["timezone"],
remembered_fallbacks["revlog_start_date"],
remembered_fallbacks["next_day"],
remembered_fallbacks["filter_out_suspended_cards"] == "y"
remembered_fallbacks["next_day"]
)
print(analysis)

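For orientation, here is a minimal sketch of the calling flow after this refactor: the suspended-card filter now travels with anki_extract (which extracts the deck and writes revlog.csv), and create_time_series only takes the timezone, the revlog start date, and the day-rollover hour. The file name and argument values below are illustrative, not taken from the commit.

import fsrs_optimizer

optimizer = fsrs_optimizer.Optimizer()

# Step 1: extract the deck and write revlog.csv; the suspended-card
# filter is now passed here rather than to create_time_series.
optimizer.anki_extract("collection.colpkg", filter_out_suspended_cards=True)

# Step 2: build the time series from revlog.csv (illustrative arguments).
analysis = optimizer.create_time_series("Asia/Shanghai", "2006-10-05", 4)
print(analysis)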
133 changes: 68 additions & 65 deletions src/fsrs_optimizer/fsrs_optimizer.py
@@ -23,6 +23,11 @@
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

New = 0
Learning = 1
Review = 2
Relearning = 3

def power_forgetting_curve(t, s):
return (1 + t / (9 * s)) ** -1
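A quick worked check of the curve above: at t = s the expression (1 + t/(9s))^-1 evaluates to 9/10, so stability can be read as the interval at which predicted retention drops to 90%. A minimal sketch:

def power_forgetting_curve(t, s):
    return (1 + t / (9 * s)) ** -1

# At t == s: (1 + 1/9) ** -1 == 0.9, i.e. 90% predicted retention.
assert abs(power_forgetting_curve(10, 10) - 0.9) < 1e-9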

@@ -310,16 +315,13 @@ class Optimizer:
def __init__(self) -> None:
tqdm.pandas()

@staticmethod
def anki_extract(filename: str):
def anki_extract(self, filename: str, filter_out_suspended_cards: bool = False):
"""Step 1"""
# Extract the collection file or deck file to get the .anki21 database.
with zipfile.ZipFile(f'{filename}', 'r') as zip_ref:
zip_ref.extractall('./')
tqdm.write("Deck file extracted successfully!")

def create_time_series(self, timezone: str, revlog_start_date: str, next_day_starts_at: int, filter_out_suspended_cards: bool = False):
"""Step 2"""
if os.path.isfile("collection.anki21b"):
os.remove("collection.anki21b")
raise Exception(
@@ -346,60 +348,61 @@ def create_time_series(self, timezone: str, revlog_start_date: str, next_day_sta
if len(revlog) == 0:
raise Exception("No review log found!")
df = pd.DataFrame(revlog)
df.columns = ['id', 'cid', 'usn', 'r', 'ivl', 'last_ivl', 'factor', 'time', 'type']
df = df[(df['cid'] <= time.time() * 1000) &
(df['id'] <= time.time() * 1000)].copy()

df_set_due_date = df[(df['type'] == 4) & (df['ivl'] > 0)]
df.columns = ['review_time', 'card_id', 'usn', 'review_rating', 'ivl', 'last_ivl', 'factor', 'review_duration', 'review_state']
df = df[(df['card_id'] <= time.time() * 1000) &
(df['review_time'] <= time.time() * 1000)].copy()
df_set_due_date = df[(df['review_state'] == 4) & (df['ivl'] > 0)]
df.drop(df_set_due_date.index, inplace=True)
df.sort_values(by=['card_id', 'review_time'], inplace=True, ignore_index=True)

df['create_date'] = pd.to_datetime(df['cid'] // 1000, unit='s')
df['create_date'] = df['create_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df['review_date'] = pd.to_datetime(df['id'] // 1000, unit='s')
df['review_date'] = df['review_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df.drop(df[df['review_date'].dt.year < 2006].index, inplace=True)
df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)

df['is_learn_start'] = (df['type'] == 0) & (df['type'].shift() != 0)
df['is_learn_start'] = (df['review_state'] == 0) & (df['review_state'].shift() != 0)
df['sequence_group'] = df['is_learn_start'].cumsum()
last_learn_start = df[df['is_learn_start']].groupby('cid')['sequence_group'].last()
df['last_learn_start'] = df['cid'].map(last_learn_start).fillna(0).astype(int)
last_learn_start = df[df['is_learn_start']].groupby('card_id')['sequence_group'].last()
df['last_learn_start'] = df['card_id'].map(last_learn_start).fillna(0).astype(int)
df['mask'] = df['last_learn_start'] <= df['sequence_group']
df = df[df['mask'] == True].copy()
df.drop(columns=['is_learn_start', 'sequence_group', 'last_learn_start', 'mask'], inplace=True)
df = df[(df['type'] != 4)].copy()

self.type_sequence = np.array(df['type'])
self.time_sequence = np.array(df['time'])
df = df[(df['review_state'] != 4)].copy()
df = df[(df['review_state'] != 3) | (df['factor'] != 0)].copy()
df['review_state'] = df['review_state'] + 1
df.loc[df['is_learn_start'], 'review_state'] = New
df.drop(columns=['is_learn_start', 'sequence_group', 'last_learn_start', 'mask', 'usn', 'ivl', 'last_ivl', 'factor'], inplace=True)
df.to_csv("revlog.csv", index=False)
tqdm.write("revlog.csv saved.")

df = df[(df['type'] != 3) | (df['factor'] != 0)].copy()
def create_time_series(self, timezone: str, revlog_start_date: str, next_day_starts_at: int):
"""Step 2"""
df = pd.read_csv("./revlog.csv")
df['review_state'] = df['review_state'].map(lambda x: x if x != New else Learning)
self.state_sequence = np.array(df['review_state'])
self.duration_sequence = np.array(df['review_duration'])
df['review_date'] = pd.to_datetime(df['review_time'] // 1000, unit='s')
df['review_date'] = df['review_date'].dt.tz_localize('UTC').dt.tz_convert(timezone)
df.drop(df[df['review_date'].dt.year < 2006].index, inplace=True)
df['real_days'] = df['review_date'] - timedelta(hours=int(next_day_starts_at))
df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D', ambiguous='infer', nonexistent='shift_forward')).to_julian_date()
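The two lines above implement the day rollover: subtracting next_day_starts_at hours before flooring to a whole day makes late-night reviews count toward the previous day. A toy check with illustrative values (assuming next_day_starts_at = 4):

import pandas as pd
from datetime import timedelta

next_day_starts_at = 4  # illustrative value
review = pd.Timestamp("2023-07-29 03:30:00")
# A 03:30 review shifted back 4 hours lands on July 28, so it is floored
# to the previous day before delta_t is computed.
print((review - timedelta(hours=next_day_starts_at)).floor("D"))  # 2023-07-28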
df.drop_duplicates(['cid', 'real_days'], keep='first', inplace=True)
df.drop_duplicates(['card_id', 'real_days'], keep='first', inplace=True)
df['delta_t'] = df.real_days.diff()
df.dropna(inplace=True)
df['i'] = df.groupby('cid').cumcount() + 1
df['i'] = df.groupby('card_id').cumcount() + 1
df.loc[df['i'] == 1, 'delta_t'] = 0
df = df.groupby('cid').filter(lambda group: group['type'].iloc[0] == 0)
df['prev_type'] = df.groupby('cid')['type'].shift(1).fillna(0).astype(int)
df['helper'] = ((df['type'] == 0) & ((df['prev_type'] == 1) | (df['prev_type'] == 2)) & (df['i'] > 1)).astype(int)
df['helper'] = df.groupby('cid')['helper'].cumsum()
df = df.groupby('card_id').filter(lambda group: group['review_state'].iloc[0] == Learning)
df['prev_review_state'] = df.groupby('card_id')['review_state'].shift(1).fillna(Learning).astype(int)
df['helper'] = ((df['review_state'] == Learning) & ((df['prev_review_state'] == Review) | (df['prev_review_state'] == Relearning)) & (df['i'] > 1)).astype(int)
df['helper'] = df.groupby('card_id')['helper'].cumsum()
df = df[df['helper'] == 0]
del df['prev_type']
del df['prev_review_state']
del df['helper']

def cum_concat(x):
return list(accumulate(x))

t_history = df.groupby('cid', group_keys=False)['delta_t'].apply(lambda x: cum_concat([[int(i)] for i in x]))
t_history = df.groupby('card_id', group_keys=False)['delta_t'].apply(lambda x: cum_concat([[int(i)] for i in x]))
df['t_history']=[','.join(map(str, item[:-1])) for sublist in t_history for item in sublist]
r_history = df.groupby('cid', group_keys=False)['r'].apply(lambda x: cum_concat([[i] for i in x]))
r_history = df.groupby('card_id', group_keys=False)['review_rating'].apply(lambda x: cum_concat([[i] for i in x]))
df['r_history']=[','.join(map(str, item[:-1])) for sublist in r_history for item in sublist]
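The accumulate-based helper above builds, for every review, the comma-joined history of the intervals (and ratings) that preceded it. A toy walk-through with illustrative values for a single card:

from itertools import accumulate

def cum_concat(x):
    return list(accumulate(x))  # accumulate concatenates the singleton lists

delta_t = [0, 3, 7]  # illustrative per-review intervals for one card
acc = cum_concat([[int(i)] for i in delta_t])   # [[0], [0, 3], [0, 3, 7]]
t_history = [','.join(map(str, item[:-1])) for item in acc]
print(t_history)  # ['', '0', '0,3'] -- each row sees only the earlier intervals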
df = df.groupby('cid').filter(lambda group: group['id'].min() > time.mktime(datetime.strptime(revlog_start_date, "%Y-%m-%d").timetuple()) * 1000)
df = df[df['r'] != 0].copy()
df['y'] = df['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x])
df = df.groupby('card_id').filter(lambda group: group['review_time'].min() > time.mktime(datetime.strptime(revlog_start_date, "%Y-%m-%d").timetuple()) * 1000)
df = df[df['review_rating'] != 0].copy()
df['y'] = df['review_rating'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x])

def remove_outliers(group: pd.DataFrame) -> pd.DataFrame:
# threshold = np.mean(group['delta_t']) * 1.5
@@ -421,7 +424,7 @@ def remove_non_continuous_rows(group):
first_non_continuous_index = discontinuity.idxmax()
return group.loc[:first_non_continuous_index-1]

df = df.groupby('cid', as_index=False, group_keys=False).progress_apply(remove_non_continuous_rows)
df = df.groupby('card_id', as_index=False, group_keys=False).progress_apply(remove_non_continuous_rows)

df.to_csv('revlog_history.tsv', sep="\t", index=False)
tqdm.write("Trainset saved.")
@@ -431,10 +434,10 @@ def remove_non_continuous_rows(group):
self.S0_dataset_group.to_csv('stability_for_pretrain.tsv', sep='\t', index=None)

df['retention'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['y'].transform('mean')
df['total_cnt'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['id'].transform('count')
df['total_cnt'] = df.groupby(by=['r_history', 'delta_t'], group_keys=False)['review_time'].transform('count')
tqdm.write("Retention calculated.")

df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_ivl', 'factor', 'time', 'type', 'create_date', 'review_date', 'real_days', 'r', 't_history', 'y'])
df.drop(columns=['review_time', 'card_id', 'review_duration', 'review_state', 'review_date', 'real_days', 'review_rating', 't_history', 'y'], inplace=True)
df.drop_duplicates(inplace=True)
df['retention'] = df['retention'].map(lambda x: max(min(0.99, x), 0.01))

@@ -616,7 +619,7 @@ def train(self, lr: float = 4e-2, n_epoch: int = 5, n_splits: int = 5, batch_siz
tqdm.write("\nTraining finished!")
return plots

def preview(self, requestRetention: float):
def preview(self, requestRetention: float, verbose=False):
my_collection = Collection(self.w)
preview_text = "1:again, 2:hard, 3:good, 4:easy\n"
for first_rating in (1,2,3,4):
@@ -627,8 +630,8 @@ def preview(self, requestRetention: float):
# print("stability, difficulty, lapses")
for i in range(10):
states = my_collection.predict(t_history, r_history)
# print('{0:9.2f} {1:11.2f} {2:7.0f}'.format(
# *list(map(lambda x: round(float(x), 4), states))))
if verbose:
print('{0:9.2f} {1:11.2f} {2:7.0f}'.format(*list(map(lambda x: round(float(x), 4), states))))
next_t = next_interval(states[0], requestRetention)
difficulty = round(float(states[1]), 1)
t_history += f',{int(next_t)}'
@@ -664,10 +667,10 @@ def predict_memory_states(self):
difficulties = map(lambda x: round(x, 2), difficulties)
self.dataset['stability'] = list(stabilities)
self.dataset['difficulty'] = list(difficulties)
prediction = self.dataset.groupby(by=['t_history', 'r_history']).agg({"stability": "mean", "difficulty": "mean", "id": "count"})
prediction = self.dataset.groupby(by=['t_history', 'r_history']).agg({"stability": "mean", "difficulty": "mean", "review_time": "count"})
prediction.reset_index(inplace=True)
prediction.sort_values(by=['r_history'], inplace=True)
prediction.rename(columns={"id": "count"}, inplace=True)
prediction.rename(columns={"review_time": "count"}, inplace=True)
prediction.to_csv("./prediction.tsv", sep='\t', index=None)
tqdm.write("prediction.tsv saved.")
prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
@@ -690,24 +693,24 @@ def find_optimal_retention(self):
f_time = 25
max_time = 1e10

type_block = dict()
type_count = dict()
type_time = dict()
last_t = self.type_sequence[0]
type_block[last_t] = 1
type_count[last_t] = 1
type_time[last_t] = self.time_sequence[0]
for i,t in enumerate(self.type_sequence[1:]):
type_count[t] = type_count.setdefault(t, 0) + 1
type_time[t] = type_time.setdefault(t, 0) + self.time_sequence[i]
if t != last_t:
type_block[t] = type_block.setdefault(t, 0) + 1
last_t = t

r_time = round(type_time[1]/type_count[1]/1000, 1)

if 2 in type_count and 2 in type_block:
f_time = round(type_time[2]/type_block[2]/1000 + r_time, 1)
state_block = dict()
state_count = dict()
state_duration = dict()
last_state = self.state_sequence[0]
state_block[last_state] = 1
state_count[last_state] = 1
state_duration[last_state] = self.duration_sequence[0]
for i, state in enumerate(self.state_sequence[1:]):
state_count[state] = state_count.setdefault(state, 0) + 1
state_duration[state] = state_duration.setdefault(state, 0) + self.duration_sequence[i]
if state != last_state:
state_block[state] = state_block.setdefault(state, 0) + 1
last_state = state

r_time = round(state_duration[Review]/state_count[Review]/1000, 1)

if Relearning in state_count and Relearning in state_block:
f_time = round(state_duration[Relearning]/state_block[Relearning]/1000 + r_time, 1)

tqdm.write(f"average time for failed cards: {f_time}s")
tqdm.write(f"average time for recalled cards: {r_time}s")
@@ -806,8 +809,8 @@ def evaluate(self):
tmp['difficulty'] = tmp['difficulty'].map(lambda x: round(x, 2))
tmp['p'] = tmp['p'].map(lambda x: round(x, 2))
tmp['log_loss'] = tmp['log_loss'].map(lambda x: round(x, 2))
tmp.rename(columns={"r": "grade", "p": "retrievability"}, inplace=True)
tmp[['id', 'cid', 'review_date', 'r_history', 't_history', 'delta_t', 'grade', 'stability', 'difficulty', 'retrievability', 'log_loss']].to_csv("./evaluation.tsv", sep='\t', index=False)
tmp.rename(columns={"p": "retrievability"}, inplace=True)
tmp[['review_time', 'card_id', 'review_date', 'r_history', 't_history', 'delta_t', 'review_rating', 'stability', 'difficulty', 'retrievability', 'log_loss']].to_csv("./evaluation.tsv", sep='\t', index=False)
del tmp
return loss_before, loss_after

