|
79 | 79 | }, |
80 | 80 | "outputs": [], |
81 | 81 | "source": [ |
82 | | - "%pip install datasetsforecast==0.0.8 --quiet\n", |
| 82 | + "%pip install datasetsforecast==0.0.8 pandas==2.2.3 --quiet\n", |
83 | 83 | "dbutils.library.restartPython()" |
84 | 84 | ] |
85 | 85 | }, |
|
174 | 174 | "\n", |
175 | 175 | "def create_m4_monthly():\n", |
176 | 176 | " y_df, _, _ = M4.load(directory=str(pathlib.Path.home()), group=\"Monthly\")\n", |
177 | | - " _ids = [f\"M{i}\" for i in range(1, n + 1)]\n", |
| 177 | + "    target_ids = {f\"M{i}\" for i in range(1, n + 1)}\n", |
| 178 | + " y_df = y_df[y_df[\"unique_id\"].isin(target_ids)]\n", |
178 | 179 | " y_df = (\n", |
179 | | - " y_df.groupby(\"unique_id\")\n", |
180 | | - " .filter(lambda x: x.unique_id.iloc[0] in _ids)\n", |
181 | | - " .groupby(\"unique_id\")\n", |
182 | | - " .apply(transform_group)\n", |
183 | | - " .reset_index(drop=True)\n", |
| 180 | + " y_df.groupby(\"unique_id\", group_keys=False)\n", |
| 181 | + " .apply(lambda g: transform_group(g, g.name))\n", |
| 182 | + " .reset_index(drop=True)\n", |
184 | 183 | " )\n", |
185 | 184 | " return y_df\n", |
186 | 185 | "\n", |
187 | 186 | "\n", |
188 | | - "def transform_group(df):\n", |
189 | | - " unique_id = df.unique_id.iloc[0]\n", |
190 | | - " _cnt = 60 # df.count()[0]\n", |
191 | | - " _start = pd.Timestamp(\"2018-01-01\")\n", |
192 | | - " _end = _start + pd.DateOffset(months=_cnt)\n", |
193 | | - " date_idx = pd.date_range(start=_start, end=_end, freq=\"M\", name=\"date\")\n", |
194 | | - " _df = (\n", |
195 | | - " pd.DataFrame(data=[], index=date_idx)\n", |
196 | | - " .reset_index()\n", |
197 | | - " .rename(columns={\"index\": \"date\"})\n", |
198 | | - " )\n", |
199 | | - " _df[\"unique_id\"] = unique_id\n", |
200 | | - " _df[\"y\"] = df[:60].y.values\n", |
201 | | - " return _df\n" |
| 187 | + "def transform_group(df, unique_id):\n", |
| 188 | + " if len(df) > 60:\n", |
| 189 | + " df = df.iloc[-60:]\n", |
| 190 | + " start = pd.Timestamp(\"2018-01-01\")\n", |
| 191 | + " date_idx = pd.date_range(start=start, periods=len(df), freq=\"ME\", name=\"ds\")\n", |
| 192 | + " res_df = pd.DataFrame({\n", |
| 193 | + " \"ds\": date_idx,\n", |
| 194 | + " \"unique_id\": unique_id,\n", |
| 195 | + " \"y\": df[\"y\"].to_numpy()\n", |
| 196 | + " })\n", |
| 197 | + " return res_df" |
202 | 198 | ] |
203 | 199 | }, |
204 | 200 | { |
|
309 | 305 | }, |
310 | 306 | "outputs": [], |
311 | 307 | "source": [ |
312 | | - "display(spark.sql(f\"select unique_id, count(date) as count from {catalog}.{db}.m4_monthly_train group by unique_id order by unique_id\"))" |
| 308 | + "display(spark.sql(f\"select unique_id, count(ds) as count from {catalog}.{db}.m4_monthly_train group by unique_id order by unique_id\"))" |
313 | 309 | ] |
314 | 310 | }, |
315 | 311 | { |
|
331 | 327 | "outputs": [], |
332 | 328 | "source": [ |
333 | 329 | "display(\n", |
334 | | - " spark.sql(f\"select * from {catalog}.{db}.m4_monthly_train where unique_id in ('M1', 'M2', 'M3', 'M4', 'M5') order by unique_id, date\")\n", |
| 330 | + " spark.sql(f\"select * from {catalog}.{db}.m4_monthly_train where unique_id in ('M1', 'M2', 'M3', 'M4', 'M5') order by unique_id, ds\")\n", |
335 | 331 | " )" |
336 | 332 | ] |
337 | 333 | }, |
|
547 | 543 | "display(spark.sql(f\"\"\"\n", |
548 | 544 | " select * from {catalog}.{db}.monthly_scoring_output \n", |
549 | 545 | " where unique_id = 'M1'\n", |
550 | | - " order by unique_id, model, date\n", |
| 546 | + " order by unique_id, model, ds\n", |
551 | 547 | " \"\"\"))" |
552 | 548 | ] |
553 | 549 | }, |
|
0 commit comments