Skip to content

Commit c994722

Browse files
committed
remove unused code and add a parameter for interpolation
1 parent 5135f06 commit c994722

File tree

1 file changed

+8
-81
lines changed

1 file changed

+8
-81
lines changed

scripts/merge.py

+8-81
Original file line numberDiff line numberDiff line change
@@ -9,89 +9,9 @@
99
from service.App import *
1010

1111
"""
12-
This script is intended for creating one output file from multiple input data files.
13-
It is needed when we want to use an additional data source in order to predict the main parameter.
14-
For example, in order to predict BTC price, we might want to add ETH prices.
15-
This script solves the following problems:
16-
- Input files might have the same column names (e.g., open, high, low, close), so the script adds prefixes to the columns of the output file
17-
- Input data may have gaps and therefore the script generates a regular time raster for the output file. The granularity of the time raster is determined by the parameter
12+
Create one output file from multiple input data files.
1813
"""
1914

20-
21-
depth_file_names = [ # Leave empty to skip
22-
#r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch1.csv",
23-
#r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch2.csv",
24-
#r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch3.csv",
25-
#r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch4.csv",
26-
#r"C:\DATA2\BITCOIN\GENERATED\depth-BTCUSDT-batch5.csv",
27-
]
28-
29-
30-
#
31-
# Readers for input files (DEPRECATED)
32-
#
33-
34-
def load_futur_files(futur_file_path):
35-
"""Return a data frame with future features."""
36-
37-
df = pd.read_csv(futur_file_path, parse_dates=['timestamp'], date_format="ISO8601")
38-
start = df["timestamp"].iloc[0]
39-
end = df["timestamp"].iloc[-1]
40-
41-
df = df.set_index("timestamp")
42-
43-
print(f"Loaded futur file with {len(df)} records in total. Range: ({start}, {end})")
44-
45-
return df, start, end
46-
47-
48-
def load_kline_files(kline_file_path):
49-
"""Return a data frame with kline features."""
50-
51-
df = pd.read_csv(kline_file_path, parse_dates=['timestamp'], date_format="ISO8601")
52-
start = df["timestamp"].iloc[0]
53-
end = df["timestamp"].iloc[-1]
54-
55-
df = df.set_index("timestamp")
56-
57-
print(f"Loaded kline file with {len(df)} records in total. Range: ({start}, {end})")
58-
59-
return df, start, end
60-
61-
62-
def load_depth_files():
63-
"""Return a list of data frames with depth features."""
64-
65-
dfs = []
66-
start = None
67-
end = None
68-
for depth_file_name in depth_file_names:
69-
df = pd.read_csv(depth_file_name, parse_dates=['timestamp'], date_format="ISO8601")
70-
# Start
71-
if start is None:
72-
start = df["timestamp"].iloc[0]
73-
elif df["timestamp"].iloc[0] < start:
74-
start = df["timestamp"].iloc[0]
75-
# End
76-
if end is None:
77-
end = df["timestamp"].iloc[-1]
78-
elif df["timestamp"].iloc[-1] > end:
79-
end = df["timestamp"].iloc[-1]
80-
81-
df = df.set_index("timestamp")
82-
83-
dfs.append(df)
84-
85-
length = np.sum([len(df) for df in dfs])
86-
print(f"Loaded {len(depth_file_names)} depth files with {length} records in total. Range: ({start}, {end})")
87-
88-
return dfs, start, end
89-
90-
#
91-
# Merger
92-
#
93-
94-
9515
@click.command()
9616
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
9717
def main(config_file):
@@ -204,6 +124,13 @@ def merge_data_sources(data_sources: list):
204124
# If different data sets have different semantics for timestamps, then data must be shifted accordingly
205125
df_out = df_out.join(ds["df"])
206126

127+
# Interpolate numeric columns
128+
merge_interpolate = App.config.get("merge_interpolate", False)
129+
if merge_interpolate:
130+
num_columns = df_out.select_dtypes((float, int)).columns.tolist()
131+
for col in num_columns:
132+
df_out[col] = df_out[col].interpolate()
133+
207134
return df_out
208135

209136

0 commit comments

Comments
 (0)