@@ -47,32 +47,17 @@ def process_simple(
     train = interactions[interactions["timestamp"] < split_date]
     val = interactions[interactions["timestamp"] >= split_date]

-    # Keep train users with condition 0 < mean_target < 1
-    train_active_users = (
-        train.groupby("user")
+    # Keep lists with condition 0 < mean_target < 1
+    train_valid_lists = (
+        train.groupby("list_id")
         .agg({"target": "mean"})
         .rename(columns={"target": "mean"})
         .reset_index()
     )
-    train_active_users = train_active_users[
-        (train_active_users["mean"] > 0) & (train_active_users["mean"] < 1)
+    train_valid_lists = train_valid_lists[
+        (train_valid_lists["mean"] > 0) & (train_valid_lists["mean"] < 1)
     ]
-    train = train.merge(train_active_users, "inner", "user")
-
-    # Limit val users to those who occurred in the train data, and also keep val users with 0 < mean_target < 1.
-    # This way val users are a subset of train users (no cold users in val!),
-    # and we get train users with at least 1 click and val users with at least 2 clicks.
-    # val = val.merge(train_active_users, "inner", "user")
-    val_active_users = (
-        val.groupby("user")
-        .agg({"target": "mean"})
-        .rename(columns={"target": "mean"})
-        .reset_index()
-    )
-    val_active_users = val_active_users[
-        (val_active_users["mean"] > 0) & (val_active_users["mean"] < 1)
-    ]
-    val = val.merge(val_active_users, "inner", "user")
+    train = train.merge(train_valid_lists, "inner", "list_id")

     # Prepare user/item to idx mappers based on train data
     unique_train_users = train["user"].unique()
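Note on the hunk above: the 0 < mean_target < 1 filter keeps only lists that contain both a click and a non-click, since a list whose mean target is 0 (all impressions) or 1 (all clicks) carries no within-list ranking signal. A minimal sketch of an equivalent filter via transform, on made-up data (only the list_id and target column names come from the diff):

import pandas as pd

# One row per recommendation slot; list 1 is all clicks, list 2 all impressions.
train = pd.DataFrame(
    {"list_id": [0, 0, 1, 1, 2, 2], "target": [1, 0, 1, 1, 0, 0]}
)

# Same effect as the groupby/agg/merge chain above, without the extra merge.
mean_target = train.groupby("list_id")["target"].transform("mean")
train = train[(mean_target > 0) & (mean_target < 1)]
print(train)  # only list 0 survives: it mixes clicks and non-clicks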
@@ -84,20 +69,32 @@ def process_simple(
         {"item": unique_train_items, "item_idx": np.arange(unique_train_items.size)}
     )

-    # Map user/item to idx
+    # Map user/item to idx - it removes cold users and items from validation
     train = train.merge(user_mapper, on="user", how="inner")
     train = train.merge(item_mapper, on="item", how="inner")
     val = val.merge(user_mapper, on="user", how="inner")
     val = val.merge(item_mapper, on="item", how="inner")

+    # Keep lists with condition 0 < mean_target < 1
+    val_valid_lists = (
+        val.groupby("list_id")
+        .agg({"target": "mean"})
+        .rename(columns={"target": "mean"})
+        .reset_index()
+    )
+    val_valid_lists = val_valid_lists[
+        (val_valid_lists["mean"] > 0) & (val_valid_lists["mean"] < 1)
+    ]
+    val = val.merge(val_valid_lists, "inner", "list_id")
+
     train = train.sort_values("timestamp").reset_index(drop=True)
     val = val.sort_values("timestamp").reset_index(drop=True)

     # Select valid columns
-    train = train[["timestamp", "user_idx", "item_idx", "target"]]
-    train.columns = ["timestamp", "user", "item", "target"]
-    val = val[["timestamp", "user_idx", "item_idx", "target"]]
-    val.columns = ["timestamp", "user", "item", "target"]
+    train = train[["timestamp", "list_id", "user_idx", "item_idx", "target"]]
+    train.columns = ["timestamp", "list_id", "user", "item", "target"]
+    val = val[["timestamp", "list_id", "user_idx", "item_idx", "target"]]
+    val.columns = ["timestamp", "list_id", "user", "item", "target"]

     # Mock test_data
     test = val.copy()  # test set == validation set (should be changed in the future!)
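The inner merges with user_mapper and item_mapper are what the new comment refers to: a val row whose user or item never appears in train finds no match in the mapper and is dropped, so no cold users or items survive. Placing the val-side list filter below these merges (as this hunk does) looks deliberate, since dropping a cold item can turn a mixed list into an all-click or all-impression one. A small sketch of the drop behaviour, with made-up data:

import numpy as np
import pandas as pd

train_users = pd.Series(["a", "b"])
user_mapper = pd.DataFrame(
    {"user": train_users, "user_idx": np.arange(train_users.size)}
)

val = pd.DataFrame({"user": ["a", "c"], "target": [1, 0]})

# Inner join: user "c" is cold (absent from train), so its row is dropped.
val = val.merge(user_mapper, on="user", how="inner")
print(val)  # only user "a" remains, now carrying its user_idx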
@@ -108,11 +105,13 @@ def process_simple(
     stats = {}
     stats["train_n_users"] = unique_train_users.size
     stats["train_n_items"] = unique_train_items.size
+    stats["train_n_lists"] = train["list_id"].nunique()
     stats["train_n_clicks"] = int(train["target"].sum())
     stats["train_n_impressions"] = len(train) - stats["train_n_clicks"]
     stats["train_ctr"] = stats["train_n_clicks"] / stats["train_n_impressions"]
     stats["val_n_users"] = unique_val_users.size
     stats["val_n_items"] = unique_val_items.size
+    stats["val_n_lists"] = val["list_id"].nunique()
     stats["val_n_clicks"] = int(val["target"].sum())
     stats["val_n_impressions"] = len(val) - stats["val_n_clicks"]
     stats["val_ctr"] = stats["val_n_clicks"] / stats["val_n_impressions"]
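One caveat on the stats, unchanged by this diff: because n_impressions counts only the target == 0 rows, the *_ctr values are a clicks-to-non-clicks ratio (odds), not the conventional clicks over all served rows. A self-contained illustration of the difference:

clicks, non_clicks = 10, 90
odds = clicks / non_clicks             # 0.111..., what the code calls ctr
rate = clicks / (clicks + non_clicks)  # 0.10, the conventional CTR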
@@ -134,17 +133,6 @@ def process_bpr(
     train = tmp0.merge(tmp1, "inner", "user", suffixes=("_neg", "_pos"))
     val = interactions[interactions["timestamp"] >= split_date]

-    val_active_users = (
-        val.groupby("user")
-        .agg({"target": "mean"})
-        .rename(columns={"target": "mean"})
-        .reset_index()
-    )
-    val_active_users = val_active_users[
-        (val_active_users["mean"] > 0) & (val_active_users["mean"] < 1)
-    ]
-    val = val.merge(val_active_users, "inner", "user")
-
     # Prepare user/item to idx mappers based on train data
     unique_train_users = train["user"].unique()
     # unique_users = train["user"].unique()
@@ -175,9 +163,21 @@ def process_bpr(

     val = val.merge(user_mapper, on="user", how="inner")
     val = val.merge(item_mapper, on="item", how="inner")
-    val = val[["user_idx", "item_idx", "target"]].rename(
-        columns={"user_idx": "user", "item_idx": "item"}
+
+    # Keep lists with condition 0 < mean_target < 1
+    val_valid_lists = (
+        val.groupby("list_id")
+        .agg({"target": "mean"})
+        .rename(columns={"target": "mean"})
+        .reset_index()
     )
+    val_valid_lists = val_valid_lists[
+        (val_valid_lists["mean"] > 0) & (val_valid_lists["mean"] < 1)
+    ]
+    val = val.merge(val_valid_lists, "inner", "list_id")
+
+    val = val[["timestamp", "list_id", "user_idx", "item_idx", "target"]]
+    val = val.rename(columns={"user_idx": "user", "item_idx": "item"})
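With this hunk the same filter block now appears three times (train and val in process_simple, val in process_bpr). A hypothetical helper, not part of the diff, that could replace all three copies under the same column conventions:

import pandas as pd


def _keep_mixed_lists(df: pd.DataFrame) -> pd.DataFrame:
    # Keep only lists with 0 < mean(target) < 1, i.e. lists containing
    # at least one click and at least one non-click.
    mean_target = df.groupby("list_id")["target"].transform("mean")
    return df[(mean_target > 0) & (mean_target < 1)]

Each call site would then reduce to val = _keep_mixed_lists(val), and the transform-based version has the side benefit of not adding the temporary "mean" column that the merge does.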
@@ -190,6 +190,7 @@ def process_bpr(
     stats["train_n_items"] = unique_train_items.size
     stats["val_n_users"] = unique_val_users.size
     stats["val_n_items"] = unique_val_items.size
+    stats["val_n_lists"] = val["list_id"].nunique()
     stats["val_n_clicks"] = int(val["target"].sum())
     stats["val_n_impressions"] = len(val) - stats["val_n_clicks"]
     stats["val_ctr"] = stats["val_n_clicks"] / stats["val_n_impressions"]
@@ -257,6 +258,11 @@ def _common(
     # Join positive interactions (clicks) with negative interactions (impressions)
     interactions = interactions.merge(impressions_dl, "inner", "recommendation_id")

+    # Create a unique id per (recommendation_id, user_id) pair
+    interactions["list_id"] = pd.factorize(
+        interactions[["recommendation_id", "user_id"]].apply(tuple, axis=1)
+    )[0]
+
     # Mark positive interactions with 1 and negative with 0
     interactions["target"] = np.where(
         interactions["series_id"] == interactions["recommended_series_list"],
@@ -266,8 +272,14 @@ def _common(
     interactions["target"] = interactions["target"].astype("int32")

     interactions = interactions[
-        ["utc_ts_milliseconds", "user_id", "recommended_series_list", "target"]
+        [
+            "utc_ts_milliseconds",
+            "list_id",
+            "user_id",
+            "recommended_series_list",
+            "target",
+        ]
     ]
-    interactions.columns = ["timestamp", "user", "item", "target"]
+    interactions.columns = ["timestamp", "list_id", "user", "item", "target"]

     return interactions
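On the list_id construction in _common: apply(tuple, axis=1) builds a Python tuple per row before factorizing, which can be slow on large frames. groupby(...).ngroup() yields the same dense ids without the row-wise apply; with sort=False the groups are numbered in order of first appearance, matching pd.factorize. A sketch assuming the same interactions frame and column names as the diff:

# Equivalent dense ids per (recommendation_id, user_id) pair,
# without materialising per-row Python tuples.
interactions["list_id"] = interactions.groupby(
    ["recommendation_id", "user_id"], sort=False
).ngroup()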