-
Notifications
You must be signed in to change notification settings - Fork 33
/
encode.py
113 lines (98 loc) · 4.14 KB
/
encode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Encoding features for Knowledge Tracing Machines.
Select a <dataset> and the features you want to include.
Case 1: There is only one skill per item.
=======
data/<dataset>/data.csv should contain the following columns:
user, item, skill, correct, wins, fails
where wins and fails are the number of successful and unsuccessful
attempts at that skill.
Case 2: There may be several skills associated with an item.
=======
data/<dataset>/data.csv just needs to contain:
user, item, correct
and data/<dataset>/q_mat.npz should be a q-matrix under scipy.sparse format.
If you want to compute wins and fails like in PFA,
you should run encode.py with --skills --wins --fails.
It is a bit slow for the moment.
Note: IDs for users and items need not be disjoint,
as we use hstack below to concatenate sparse matrices.
See paper: https://arxiv.org/abs/1811.03388
Authors: Jill-Jênn Vie, 2024
"""
import argparse
import logging
import os
from collections import Counter
from scipy.sparse import coo_matrix, save_npz, load_npz, hstack, vstack
import pandas as pd
import numpy as np
def _str2bool(value):
    """
    Parse an explicit command-line boolean such as ``--users false``.

    ``type=bool`` must not be used for this: ``bool('False')`` is ``True``
    because every non-empty string is truthy, so ``--users False`` would
    silently *enable* the feature.

    Raises argparse.ArgumentTypeError on an unrecognised value, which argparse
    reports as a usage error.
    """
    lowered = value.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    # The empty string was already falsy under the former ``type=bool``.
    if lowered in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean, got {value!r}')


# Feature families that can be switched on from the command line. Their order
# here is the order in which active features are listed below.
all_features = ['users', 'items', 'skills', 'wins', 'fails']

parser = argparse.ArgumentParser(description='Encode datasets')
parser.add_argument('--dataset', type=str, nargs='?', default='dummy')
# One switch per feature family: a bare flag (``--users``) enables it, and an
# explicit value (``--users true`` / ``--users false``) is accepted as well.
for _feature in all_features:
    parser.add_argument(f'--{_feature}', type=_str2bool, nargs='?',
                        const=True, default=False)
# Comma-separated names of additional columns of data.csv.
parser.add_argument('--extra', type=str, nargs='?', default='')
options = parser.parse_args()

os.chdir(os.path.join('data', options.dataset))  # Move to dataset folder

# Features requested on the command line, in the order of all_features.
active_features = [name for name in all_features if getattr(options, name)]
# Initials of the active features, e.g. 'uis' for users + items + skills;
# used to tag the output file names.
features_suffix = ''.join(name[0] for name in active_features)
def onehotize(col):
    """
    One-hot encode a column of integer IDs as a sparse COO matrix.

    Row ``i`` of the result holds a single 1, located in column ``col[i]``.
    The matrix has ``len(col)`` rows and ``col.max() + 1`` columns, so IDs
    are assumed to be integers in ``[0, col.max()]``.
    """
    n_cols = col.max() + 1
    n_rows = len(col)
    row_idx = np.arange(n_rows)        # one event per row: 0, 1, ..., n_rows - 1
    ones = np.ones(n_rows, dtype=int)  # a single unit entry per event
    return coo_matrix((ones, (row_idx, col)), shape=(n_rows, n_cols))
def df_to_sparse(df, active_features):
    """
    Build the sparse design matrix and the label vector from a DataFrame.

    One sparse block is prepared per feature family (``users``, ``items``,
    ``skills``, ``wins``, ``fails``, plus one per column named in the global
    ``options.extra``). The blocks listed in ``active_features`` are then
    concatenated column-wise, in that order.

    Skills come from one of two sources:

    * a ``skill`` column in ``df``, in which case the ``wins`` and ``fails``
      columns of ``df`` are used as the counts;
    * otherwise a ``q_mat.npz`` file in the current directory, in which case
      the counts are recomputed by a sequential pass over the rows.

    Args:
        df: DataFrame with at least ``user`` and ``correct`` columns. In the
            q-matrix case an ``item0`` column is added to it in place.
        active_features: names of the blocks to stack, in column order.

    Returns:
        ``(X_train, y_train)``: a CSR matrix with one row per row of ``df``,
        and the values of the ``correct`` column.

    Raises:
        KeyError: if an active feature cannot be built from the available data.
    """
    X = {}
    X['users'] = onehotize(df['user'])
    if 'item' in df.columns:
        X['items'] = onehotize(df['item'])
    if options.extra:
        for column in options.extra.split(','):
            X[column] = onehotize(df[column])

    if 'skill' in df.columns:
        X['skills'] = onehotize(df['skill'])
        # Reuse the one-hot sparsity pattern and overwrite the stored values
        # with the counts. A plain ndarray is assigned: the data buffer of a
        # sparse matrix is not meant to be a pandas Series.
        X['wins'] = X['skills'].copy()
        X['wins'].data = df['wins'].values
        X['fails'] = X['skills'].copy()
        X['fails'].data = df['fails'].values
    elif os.path.isfile('q_mat.npz'):  # Have to recompute counts
        q_matrix = load_npz('q_mat.npz')
        _, nb_skills = q_matrix.shape
        # Map raw item IDs to 0..n-1 (rank in sorted order) to index q_matrix
        # rows. Note that this adds a column to the caller's DataFrame.
        df['item0'] = np.unique(df['item'], return_inverse=True)[1]
        X['skills'] = q_matrix[df['item0']]
        # Both blocks come out of the same pass, so run it when either one
        # is requested (asking for fails alone used to end in a KeyError).
        if 'wins' in active_features or 'fails' in active_features:
            wins_rows = []
            fails_rows = []
            # NOTE(review): these counters are shared by all rows, i.e. they
            # are not reset or keyed per user. PFA-style counts are normally
            # per student -- confirm that this is intended.
            wins = np.zeros(nb_skills)
            fails = np.zeros(nb_skills)
            for item_id, correct in zip(df['item0'], df['correct']):
                item_skills = q_matrix[item_id]
                # Features of a row use the counts *before* its own outcome.
                wins_rows.append(item_skills.multiply(wins))
                fails_rows.append(item_skills.multiply(fails))
                increment = item_skills.toarray().reshape(-1)
                if correct:
                    wins += increment
                else:
                    fails += increment
            X['wins'] = vstack(wins_rows)
            X['fails'] = vstack(fails_rows)

    # Diagnostic: distribution of the number of skills per row. Skipped when
    # no skill information is available instead of failing on X['skills'].
    if 'skills' in X:
        print('nb skills', Counter(X['skills'].sum(axis=1).A1))

    missing = [name for name in active_features if name not in X]
    if missing:
        raise KeyError(
            f'Cannot encode features {missing}: the dataset provides neither '
            'the required columns nor a q_mat.npz file')

    X_train = hstack([X[name] for name in active_features]).tocsr()
    y_train = df['correct'].values
    return X_train, y_train
# Encode data.csv from the current working directory.
df = pd.read_csv('data.csv')
X, y = df_to_sparse(df, active_features)
logging.warning(df.head())

# For the dummy dataset only, also print the dense design matrix.
show_dense = options.dataset == 'dummy'
if show_dense:
    print(X.todense())

# Output files are tagged with the suffix derived from the active features.
features_file = f'X-{features_suffix}.npz'
labels_file = f'y-{features_suffix}.npy'
save_npz(features_file, X)
np.save(labels_file, y)

logging.warning(
    'Successfully created X-%s.npz and y-%s.npy in data/%s folder',
    features_suffix,
    features_suffix,
    options.dataset,
)