# Data Splitting

Before experimenting with the models, the required datasets first have to be prepared. `reclearn` provides splitting methods for the ml-1m, beauty, games, steam, and criteo datasets in `reclearn/data/datasets`. Readers can of course design their own schemes; this page only gives a brief description of the basic approach.

Data splitting mainly accomplishes the following tasks:

1. Re-encode the user and item indices;
2. Split the data into training, validation, and test sets and store each of them separately;
3. Record the maximum user and item indices after re-encoding, which are used to specify the size of the embedding tables (see the sketch after this list).
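
As a rough illustration of point 3, the sketch below (not part of `reclearn`; the file name `ml_meta.txt` matches the MovieLens example later on this page) reads the recorded maxima back and derives the embedding vocabulary sizes:

```python
# The meta file stores "max_user_id\tmax_item_id" on a single line.
with open("ml_meta.txt") as f:
    max_user_num, max_item_num = [int(x) for x in f.readline().split("\t")]

# Index 0 is reserved for sequence padding and real ids start from 1,
# so the vocabulary size is the recorded maximum plus one.
user_vocab_size = max_user_num + 1
item_vocab_size = max_item_num + 1
```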

The ml-1m, beauty, games, and steam datasets are split in much the same way, so they are described together. Criteo, because of its huge data volume, is explained separately.



## MovieLens and Others

The main tasks have been outlined above; on top of them, the following points also need attention:

- Index encoding starts from 1, since 0 is reserved for sequence padding.
- Each user's last interaction is used as the test set, the second-to-last as the validation set, and the rest as the training set (e.g., for a user whose time-ordered history is i1, i2, i3, i4, i5: i5 goes to the test set, i4 to the validation set, and i1-i3 to the training set).

For example, the `split_data` method in `data/datasets/movielens.py`:

```python
import os
import random

from tqdm import tqdm  # progress bar used when scanning the ratings file


def split_data(file_path):
    """split movielens for general recommendation
    Args:
        :param file_path: A string. The file path of 'ratings.dat'.
    :return: train_path, val_path, test_path, meta_path
    """
    dst_path = os.path.dirname(file_path)
    train_path = os.path.join(dst_path, "ml_train.txt")
    val_path = os.path.join(dst_path, "ml_val.txt")
    test_path = os.path.join(dst_path, "ml_test.txt")
    meta_path = os.path.join(dst_path, "ml_meta.txt")
    users, items = set(), set()
    history = {}
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            user, item, score, timestamp = line.strip().split("::")
            users.add(int(user))
            items.add(int(item))
            history.setdefault(int(user), [])
            history[int(user)].append([item, timestamp])
    random.shuffle(list(users))
    with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3:
        for user in users:
            hist = history[int(user)]
            # sort one user's interactions by timestamp
            hist.sort(key=lambda x: x[1])
            for idx, value in enumerate(hist):
                if idx == len(hist) - 1:
                    # last interaction -> test set
                    f3.write(str(user) + '\t' + value[0] + '\n')
                elif idx == len(hist) - 2:
                    # second-to-last interaction -> validation set
                    f2.write(str(user) + '\t' + value[0] + '\n')
                else:
                    f1.write(str(user) + '\t' + value[0] + '\n')
    with open(meta_path, 'w') as f:
        # record the maximum user id and item id for the embedding tables
        f.write(str(max(users)) + '\t' + str(max(items)))
    return train_path, val_path, test_path, meta_path
```

Since user and item ids in ml-1m already start from 1, no re-encoding is needed.
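
For reference, here is a minimal usage sketch. The ratings path is a placeholder and `load_pairs` is a hypothetical helper, not `reclearn`'s own loader; each line of the generated train/val/test files is simply `user<TAB>item`:

```python
# Hypothetical input path; split_data returns the paths of the files it wrote.
train_path, val_path, test_path, meta_path = split_data("data/ml-1m/ratings.dat")

def load_pairs(path):
    """Hypothetical helper: read one 'user\titem' pair per line."""
    pairs = []
    with open(path) as f:
        for line in f:
            user, item = line.strip().split('\t')
            pairs.append((int(user), int(item)))
    return pairs

train_pairs = load_pairs(train_path)
```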

Most models in the reclearn project use the **user's behavior sequence as a feature**, so a sequence-style split is also provided. For example, the `split_seq_data` method in `data/datasets/games.py`:

```python
def split_seq_data(file_path):
    """split amazon games for sequence recommendation
    Args:
        :param file_path: A string. The file path of 'ratings_Beauty.dat'.
    :return: train_path, val_path, test_path, meta_path
    """
    dst_path = os.path.dirname(file_path)
    train_path = os.path.join(dst_path, "games_seq_train.txt")
    val_path = os.path.join(dst_path, "games_seq_val.txt")
    test_path = os.path.join(dst_path, "games_seq_test.txt")
    meta_path = os.path.join(dst_path, "games_seq_meta.txt")
    users, items = set(), dict()
    # re-encoded ids start from 1 (0 is reserved for padding)
    user_idx, item_idx = 1, 1
    history = {}
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            user, item, score, timestamp = line.strip().split(",")
            users.add(user)
            if items.get(item) is None:
                items[item] = str(item_idx)
                item_idx += 1
            history.setdefault(user, [])
            history[user].append([items[item], timestamp])
    with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3:
        for user in users:
            hist_u = history[user]
            # skip users with fewer than 4 interactions
            if len(hist_u) < 4:
                continue
            hist_u.sort(key=lambda x: x[1])
            hist = [x[0] for x in hist_u]
            time = [x[1] for x in hist_u]
            # train: input sequence and timestamps (all but the last two interactions)
            f1.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + '\n')
            # val: same input sequence, with the second-to-last item as the target
            f2.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + "\t" + hist[-2] + '\n')
            # test: sequence up to the last item, with the last item as the target
            f3.write(str(user_idx) + "\t" + ' '.join(hist[:-1]) + "\t" + ' '.join(time[:-1]) + "\t" + hist[-1] + '\n')
            user_idx += 1
    with open(meta_path, 'w') as f:
        # maximum re-encoded user id and item id
        f.write(str(user_idx - 1) + '\t' + str(item_idx - 1))
    return train_path, val_path, test_path, meta_path
```
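
Note that the line formats differ slightly: the training file stores `user<TAB>item_sequence<TAB>timestamp_sequence`, while the validation and test files append the target item as a fourth field. Below is a small hedged sketch (the ratings path is a placeholder and the parser is not `reclearn`'s own loader) for reading the validation/test lines back:

```python
# Hypothetical input path; split_seq_data returns the paths of the files it wrote.
train_path, val_path, test_path, meta_path = split_seq_data("data/games/ratings.csv")

def parse_seq_line(line):
    """Hypothetical parser for one val/test line: user \t item_seq \t time_seq \t target."""
    user, item_seq, time_seq, target = line.strip().split('\t')
    return int(user), item_seq.split(' '), time_seq.split(' '), target

with open(val_path) as f:
    for line in f:
        user, item_seq, time_seq, target = parse_seq_line(line)
```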



## Criteo

The Criteo dataset contains roughly 45 million records, so it is hard to read all of the data into memory the way we do with MovieLens. Two ways of handling this are provided:

1. Read only part of the data for the experiment;
2. Split the dataset into several parts and read them one at a time during training, which avoids having to load everything at once.

**The first method** is implemented in the `create_small_criteo_dataset` method of `data/datasets/criteo.py`:

```python
def create_small_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2):
    """Load small criteo data(sample num) without splitting "train.txt".
    Note: If you want to load all data in the memory, please set "read_part" to False.
    Args:
        :param file: A string. dataset's path.
        :param embed_dim: A scalar. the embedding dimension of sparse features.
        :param read_part: A boolean. whether to read part of it.
        :param sample_num: A scalar. the number of instances if read_part is True.
        :param test_size: A scalar(float). ratio of test dataset.
    :return: feature columns such as [sparseFeature1, sparseFeature2, ...],
             train, such as ({'C1': [...], 'C2': [...]}, [1, 0, 1, ...])
             and test ({'C1': [...], 'C2': [...]}, [1, 0, 1, ...]).
    """
    if read_part:
        data_df = pd.read_csv(file, sep='\t', iterator=True, header=None,
                              names=NAMES)
        data_df = data_df.get_chunk(sample_num)
    else:
        data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES)
    # ... (the remainder of the method, which builds the feature columns and
    # the train/test split described in the docstring, is omitted here)
```

Only the reading part of the method is shown above: the `read_part` parameter controls whether just a portion of the data is read, and `sample_num` is the total number of rows to read.
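
A hedged usage sketch (the file path is a placeholder; the return structure follows the docstring above):

```python
# Hypothetical path to the raw Criteo "train.txt"; read 500,000 rows only.
feature_columns, train, test = create_small_criteo_dataset(
    file="data/criteo/train.txt",
    embed_dim=8,
    read_part=True,
    sample_num=500000,
    test_size=0.2
)
train_X, train_y = train
test_X, test_y = test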



**The second method** needs to address the following:

1. The number of rows in each sub-dataset;
2. Where the sub-datasets are stored and how they are named;
3. How the split itself is carried out.

The `get_split_file_path` method in `data/datasets/criteo.py` performs the split and returns the relative paths of the resulting sub-datasets:

```python
def get_split_file_path(parent_path=None, dataset_path=None, sample_num=5000000):
    """Get the list of split file path.
    Note: Either parent_path or dataset_path must be valid.
    If exists dataset_path + "/split", parent_path = dataset_path + "/split".
    Args:
        :param parent_path: A string. split file's parent path.
        :param dataset_path: A string.
        :param sample_num: A int. The sample number of every split file.
    :return: A list. [file1_path, file2_path, ...]
    """
    sub_dir_name = 'split'
    if parent_path is None and dataset_path is None:
        raise ValueError('Please give parent path or file path.')
    if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)):
        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
    elif parent_path is None or not os.path.exists(parent_path):
        splitByLineCount(dataset_path, sample_num, sub_dir_name)
        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
    split_file_name = os.listdir(parent_path)
    split_file_name.sort()
    split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt']
    return split_file_list
```
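
A hedged usage sketch (the dataset path is a placeholder; the printed list mirrors the naming scheme of `mkSubFile` shown further below):

```python
# Hypothetical path to the raw Criteo file; each sub-file will hold 5,000,000 rows.
split_file_list = get_split_file_path(
    dataset_path="data/criteo/train.txt",
    sample_num=5000000
)
print(split_file_list)
# e.g. ['data/criteo/split/train_01.txt', 'data/criteo/split/train_02.txt', ...]
```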

`parent_path` is the directory in which the sub-datasets are stored, and `dataset_path` is the path of the original dataset. The method also checks whether the split has already been done: if so, the list of sub-dataset paths is returned directly, avoiding redundant work. If not, the split is carried out by the `splitByLineCount` method in `data/utils.py`:

```python
def splitByLineCount(filename, count, sub_dir_name):
    """Split File.
    Note: You can specify how many rows of data each sub file contains.
    Args:
        :param filename: A string.
        :param count: A scalar(int).
        :param sub_dir_name: A string.
    :return:
    """
    f = open(filename, 'r')
    try:
        head = f.readline()
        buf = []
        sub = 1
        for line in f:
            buf.append(line)
            if len(buf) == count:
                sub = mkSubFile(buf, head, filename, sub_dir_name, sub)
                buf = []
        if len(buf) != 0:
            mkSubFile(buf, head, filename, sub_dir_name, sub)
    finally:
        f.close()
```

The `mkSubFile` method writes out the buffered lines `buf` once the target row count is reached; the sub-file's name is indexed by `sub`.

```python
def mkSubFile(lines, head, srcName, sub_dir_name, sub):
    """Write sub-data.
    Args:
        :param lines: A list. Several pieces of data.
        :param head: A string. ['label', 'I1', 'I2', ...].
        :param srcName: A string. The name of data.
        :param sub_dir_name: A string.
        :param sub: A scalar(Int). Record the current number of sub file.
    :return: sub + 1.
    """
    root_path, file = os.path.split(srcName)
    file_name, suffix = file.split('.')
    split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix
    split_file = os.path.join(root_path, sub_dir_name, split_file_name)
    if not os.path.exists(os.path.join(root_path, sub_dir_name)):
        os.mkdir(os.path.join(root_path, sub_dir_name))
    print('make file: %s' % split_file)
    f = open(split_file, 'w')
    try:
        f.writelines([head])
        f.writelines(lines)
        return sub + 1
    finally:
        f.close()
```

This completes the splitting of the large dataset; any one of the sub-datasets can then be chosen as the test set.
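
As a rough sketch of how the split files might then be consumed during training (the import path and file path are assumptions based on the module layout described above; feature processing and the actual training call are omitted):

```python
import pandas as pd

# Assumed module path; NAMES is the column-name list used in data/datasets/criteo.py.
from reclearn.data.datasets.criteo import get_split_file_path, NAMES

# Hold out the last sub-file as the test set and read the remaining sub-files
# one at a time, so the full dataset never has to sit in memory.
split_file_list = get_split_file_path(dataset_path="data/criteo/train.txt")
test_file = split_file_list[-1]
train_files = split_file_list[:-1]

for file_path in train_files:
    chunk_df = pd.read_csv(file_path, sep='\t', header=None, names=NAMES)
    # ... feature processing and one round of training on this chunk ...
```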