|
9 | 9 | from service.App import *
|
10 | 10 |
|
11 | 11 | """
|
12 |
| -This script is intended for creating one output file from multiple input data files. |
13 |
| -It is needed when we want to use additional data source in order to predict the main parameter. |
14 |
| -For example, in order to predict BTC price, we might want to add ETH prices. |
15 |
| -This script solves the following problems: |
16 |
| -- Input files might have the same column names (e.g., open, high, low, close) and therefore it adds prefixes to the columns of the output file |
17 |
| -- Input data may have gaps and therefore the script generates a regular time raster for the output file. The granularity of the time raster is determined by the parameter |
| 12 | +Create one output file from multiple input data files. |
18 | 13 | """
|
19 | 14 |
|
20 |
| - |
21 |
| -depth_file_names = [ # Leave empty to skip |
22 |
| - #r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch1.csv", |
23 |
| - #r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch2.csv", |
24 |
| - #r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch3.csv", |
25 |
| - #r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch4.csv", |
26 |
| - #r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch5.csv", |
27 |
| -] |
28 |
| - |
29 |
| - |
30 |
| -# |
31 |
| -# Readers from input files (DEPRECATED) |
32 |
| -# |
33 |
| - |
34 |
| -def load_futur_files(futur_file_path): |
35 |
| - """Return a data frame with future features.""" |
36 |
| - |
37 |
| - df = pd.read_csv(futur_file_path, parse_dates=['timestamp'], date_format="ISO8601") |
38 |
| - start = df["timestamp"].iloc[0] |
39 |
| - end = df["timestamp"].iloc[-1] |
40 |
| - |
41 |
| - df = df.set_index("timestamp") |
42 |
| - |
43 |
| - print(f"Loaded futur file with {len(df)} records in total. Range: ({start}, {end})") |
44 |
| - |
45 |
| - return df, start, end |
46 |
| - |
47 |
| - |
48 |
| -def load_kline_files(kline_file_path): |
49 |
| - """Return a data frame with kline features.""" |
50 |
| - |
51 |
| - df = pd.read_csv(kline_file_path, parse_dates=['timestamp'], date_format="ISO8601") |
52 |
| - start = df["timestamp"].iloc[0] |
53 |
| - end = df["timestamp"].iloc[-1] |
54 |
| - |
55 |
| - df = df.set_index("timestamp") |
56 |
| - |
57 |
| - print(f"Loaded kline file with {len(df)} records in total. Range: ({start}, {end})") |
58 |
| - |
59 |
| - return df, start, end |
60 |
| - |
61 |
| - |
62 |
| -def load_depth_files(): |
63 |
| - """Return a list of data frames with depth features.""" |
64 |
| - |
65 |
| - dfs = [] |
66 |
| - start = None |
67 |
| - end = None |
68 |
| - for depth_file_name in depth_file_names: |
69 |
| - df = pd.read_csv(depth_file_name, parse_dates=['timestamp'], date_format="ISO8601") |
70 |
| - # Start |
71 |
| - if start is None: |
72 |
| - start = df["timestamp"].iloc[0] |
73 |
| - elif df["timestamp"].iloc[0] < start: |
74 |
| - start = df["timestamp"].iloc[0] |
75 |
| - # End |
76 |
| - if end is None: |
77 |
| - end = df["timestamp"].iloc[-1] |
78 |
| - elif df["timestamp"].iloc[-1] > end: |
79 |
| - end = df["timestamp"].iloc[-1] |
80 |
| - |
81 |
| - df = df.set_index("timestamp") |
82 |
| - |
83 |
| - dfs.append(df) |
84 |
| - |
85 |
| - length = np.sum([len(df) for df in dfs]) |
86 |
| - print(f"Loaded {len(depth_file_names)} depth files with {length} records in total. Range: ({start}, {end})") |
87 |
| - |
88 |
| - return dfs, start, end |
89 |
| - |
90 |
| -# |
91 |
| -# Merger |
92 |
| -# |
93 |
| - |
94 |
| - |
95 | 15 | @click.command()
|
96 | 16 | @click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
|
97 | 17 | def main(config_file):
|
@@ -204,6 +124,13 @@ def merge_data_sources(data_sources: list):
|
204 | 124 | # If different data sets have different semantics for timestamps, then data must be shifted accordingly
|
205 | 125 | df_out = df_out.join(ds["df"])
|
206 | 126 |
|
| 127 | +    # Optionally interpolate numeric columns to fill NaN gaps produced by joining sources on a common time raster |
| 128 | + merge_interpolate = App.config.get("merge_interpolate", False) |
| 129 | + if merge_interpolate: |
| 130 | + num_columns = df_out.select_dtypes((float, int)).columns.tolist() |
| 131 | + for col in num_columns: |
| 132 | + df_out[col] = df_out[col].interpolate() |
| 133 | + |
207 | 134 | return df_out
|
208 | 135 |
|
209 | 136 |
|
|
0 commit comments