Skip to content

Commit 8cce390

Browse files
Merge pull request #46 from gregory-halverson/main
v2.11.0 robust input dataset
2 parents a0f0145 + df92761 commit 8cce390

File tree

46 files changed

+226042
-219916
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+226042
-219916
lines changed

FLiESANN/ECOv002-cal-val-FLiESANN-inputs.csv

Lines changed: 1065 additions & 1065 deletions
Large diffs are not rendered by default.

FLiESANN/ECOv002-cal-val-FLiESANN-outputs.csv

Lines changed: 1066 additions & 1066 deletions
Large diffs are not rendered by default.

FLiESANN/FLiESANN.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
from .prepare_FLiESANN_inputs import prepare_FLiESANN_inputs
77
from .run_FLiESANN_inference import run_FLiESANN_inference
88
from .process_FLiESANN import FLiESANN
9-
from .generate_FLiESANN_inputs_table import generate_FLiES_inputs_table
9+
from .generate_FLiESANN_inputs_table_deprecated import generate_FLiES_inputs_table
1010
from .process_FLiESANN_table import process_FLiESANN_table
1111
from .ECOv002_static_tower_FLiESANN_inputs import load_ECOv002_static_tower_FLiESANN_inputs
1212
from .ECOv002_calval_FLiESANN_inputs import load_ECOv002_calval_FLiESANN_inputs
1313
from .ECOv002_calval_FLiESANN_outputs import load_ECOv002_calval_FLiESANN_outputs
1414
from .verify import verify
1515
from .retrieve_FLiESANN_GEOS5FP_inputs import retrieve_FLiESANN_GEOS5FP_inputs
16+
from .generate_FLiESANN_inputs_table import generate_FLiESANN_inputs_table
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import numpy as np
2+
import pandas as pd
3+
import shapely
4+
import rasters as rt
5+
6+
7+
def filter_dataframe_to_location_time_pairs(df, geometry, time_UTC):
8+
"""Filter DataFrame or 2D array returned from GEOS5FP to match original location-time pairs"""
9+
10+
# Handle DataFrame
11+
if isinstance(df, pd.DataFrame):
12+
if len(df) == len(geometry.geoms):
13+
return df
14+
15+
# Extract first column (data values) from DataFrame
16+
data_array = df.iloc[:, 0].values.astype(np.float32)
17+
# Handle 2D numpy array
18+
elif isinstance(df, np.ndarray) and len(df.shape) == 2:
19+
if df.shape[0] == (len(geometry.geoms) if hasattr(geometry, 'geoms') else 0):
20+
return df
21+
22+
# Extract first column if multiple columns
23+
if df.shape[1] > 1:
24+
data_array = df[:, 0].astype(np.float32)
25+
else:
26+
data_array = df.flatten().astype(np.float32)
27+
# Handle 1D array or scalar
28+
else:
29+
return df
30+
31+
# Get coordinates of input geometry points
32+
if isinstance(geometry, (shapely.geometry.MultiPoint, rt.MultiPoint)):
33+
input_coords = [(geom.x, geom.y) for geom in geometry.geoms]
34+
else:
35+
return df
36+
37+
# Convert time_UTC to array if it's a single value
38+
if not hasattr(time_UTC, '__len__'):
39+
time_UTC_array = [time_UTC] * len(input_coords)
40+
else:
41+
time_UTC_array = time_UTC
42+
43+
# For each input location-time pair, find the matching DataFrame row
44+
# GEOS5FP processes unique times and returns data for all locations at each time
45+
# We need to select only the rows that match our specific location-time pairs
46+
47+
# Create a mapping of (lat, lon, time_index) -> row index
48+
# The DataFrame contains all locations for each unique time
49+
unique_times = sorted(set(pd.to_datetime(time_UTC_array)))
50+
time_to_index = {t: i for i, t in enumerate(unique_times)}
51+
52+
selected_rows = []
53+
for i, (coord, time_val) in enumerate(zip(input_coords, time_UTC_array)):
54+
time_val = pd.to_datetime(time_val)
55+
time_idx = time_to_index[time_val]
56+
# Row index = time_idx * num_locations + location_idx
57+
row_idx = time_idx * len(input_coords) + i
58+
if row_idx < len(data_array):
59+
selected_rows.append(row_idx)
60+
61+
if len(selected_rows) == len(input_coords):
62+
return data_array[selected_rows].astype(np.float32)
63+
else:
64+
# Fallback: return all rows if filtering fails
65+
return data_array.astype(np.float32)
Lines changed: 127 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -1,162 +1,139 @@
1-
from dateutil import parser
1+
import logging
22

33
import numpy as np
44
import pandas as pd
5-
5+
import rasters as rt
6+
from dateutil import parser
7+
from pandas import DataFrame
8+
from rasters import MultiPoint, WGS84
9+
from shapely.geometry import Point
610
from GEOS5FP import GEOS5FP
7-
from sentinel_tiles import sentinel_tiles
8-
from sun_angles import calculate_SZA_from_datetime
9-
from koppengeiger import load_koppen_geiger
10-
11-
import logging
11+
from NASADEM import NASADEMConnection
12+
from .retrieve_FLiESANN_inputs import retrieve_FLiESANN_inputs
1213

1314
logger = logging.getLogger(__name__)
1415

15-
def generate_FLiES_inputs_table(
16-
FLiES_inputs_from_calval_df: pd.DataFrame,
17-
GEOS5FP_connection: GEOS5FP = None) -> pd.DataFrame:
18-
"""
19-
FLiES_inputs_from_claval_df:
20-
Pandas DataFrame containing the columns: tower, lat, lon, time_UTC, albedo, elevation_km
21-
return:
22-
Pandas DataFrame containing the columns: tower, lat, lon, time_UTC, doy, albedo, elevation_km, AOT, COT, vapor_gccm, ozone_cm, SZA, KG
16+
def generate_FLiESANN_inputs_table(
17+
input_df: DataFrame,
18+
GEOS5FP_connection: GEOS5FP = None,
19+
NASADEM_connection: NASADEMConnection = None) -> DataFrame:
2320
"""
24-
if GEOS5FP_connection is None:
25-
GEOS5FP_connection = GEOS5FP()
26-
27-
# output_rows = []
28-
FLiES_inputs_df = FLiES_inputs_from_calval_df.copy()
29-
30-
doy = []
31-
AOT = []
32-
COT = []
33-
vapor_gccm = []
34-
ozone_cm = []
35-
SWin = []
36-
Tmin_K = []
37-
SZA = []
38-
KG = []
39-
40-
for i, input_row in FLiES_inputs_from_calval_df.iterrows():
41-
tower = input_row.tower
42-
lat = input_row.lat
43-
lon = input_row.lon
44-
time_UTC = input_row.time_UTC
45-
albedo = input_row.albedo
46-
elevation_km = input_row.elevation_km
47-
logger.info(f"collecting FLiES inputs for tower {tower} lat {lat} lon {lon} time {time_UTC} UTC")
48-
time_UTC = parser.parse(str(time_UTC))
49-
doy.append(time_UTC.timetuple().tm_yday)
50-
date_UTC = time_UTC.date()
51-
tile = sentinel_tiles.toMGRS(lat, lon)[:5]
52-
53-
try:
54-
tile_grid = sentinel_tiles.grid(tile=tile, cell_size=70)
55-
except Exception as e:
56-
logger.error(e)
57-
logger.warning(f"unable to process tile {tile}")
58-
AOT.append(np.nan)
59-
COT.append(np.nan)
60-
vapor_gccm.append(np.nan)
61-
ozone_cm.append(np.nan)
62-
SZA.append(np.nan)
63-
KG.append(np.nan)
64-
continue
65-
66-
rows, cols = tile_grid.shape
67-
row, col = tile_grid.index_point(rt.Point(lon, lat))
68-
geometry = tile_grid[max(0, row - 1):min(row + 2, rows - 1),
69-
max(0, col - 1):min(col + 2, cols - 1)]
70-
71-
if not "AOT" in FLiES_inputs_from_calval_df.columns:
72-
try:
73-
logger.info("retrieving GEOS-5 FP aerosol optical thickness raster")
74-
AOT.append(np.nanmedian(GEOS5FP_connection.AOT(time_UTC=time_UTC, geometry=geometry)))
75-
except Exception as e:
76-
AOT.append(np.nan)
77-
logger.exception(e)
78-
79-
if not "COT" in FLiES_inputs_from_calval_df.columns:
80-
try:
81-
logger.info("generating GEOS-5 FP cloud optical thickness raster")
82-
COT.append(np.nanmedian(GEOS5FP_connection.COT(time_UTC=time_UTC, geometry=geometry)))
83-
except Exception as e:
84-
COT.append(np.nan)
85-
logger.exception(e)
86-
87-
if not "vapor_gccm" in FLiES_inputs_from_calval_df.columns:
88-
try:
89-
logger.info("generating GEOS5-FP water vapor raster in grams per square centimeter")
90-
vapor_gccm.append(np.nanmedian(GEOS5FP_connection.vapor_gccm(time_UTC=time_UTC, geometry=geometry)))
91-
except Exception as e:
92-
vapor_gccm.append(np.nan)
93-
logger.exception(e)
94-
95-
if not "ozone_cm" in FLiES_inputs_from_calval_df.columns:
96-
try:
97-
logger.info("generating GEOS5-FP ozone raster in grams per square centimeter")
98-
ozone_cm.append(np.nanmedian(GEOS5FP_connection.ozone_cm(time_UTC=time_UTC, geometry=geometry)))
99-
except Exception as e:
100-
ozone_cm.append(np.nan)
101-
logger.exception(e)
102-
103-
if not "SWin" in FLiES_inputs_from_calval_df.columns:
104-
try:
105-
logger.info("generating GEOS5-FP incoming solar radiation raster in watts per square meter")
106-
SWin.append(np.nanmedian(GEOS5FP_connection.SWin(time_UTC=time_UTC, geometry=geometry)))
107-
except Exception as e:
108-
SWin.append(np.nan)
109-
logger.exception(e)
110-
111-
if not "Tmin_K" in FLiES_inputs_from_calval_df.columns:
112-
try:
113-
logger.info("generating GEOS5-FP minimum temperature in Kelvin")
114-
Tmin_K.append(np.nanmedian(GEOS5FP_connection.Tmin_K(time_UTC=time_UTC, geometry=geometry)))
115-
except Exception as e:
116-
Tmin_K.append(np.nan)
117-
logger.exception(e)
118-
119-
if not "SZA" in FLiES_inputs_from_calval_df.columns:
120-
try:
121-
logger.info("calculating solar zenith angle")
122-
SZA.append(calculate_SZA_from_datetime(time_UTC, lat, lon))
123-
except Exception as e:
124-
SZA.append(np.nan)
125-
logger.exception(e)
126-
127-
if not "KG" in FLiES_inputs_from_calval_df.columns:
128-
try:
129-
logger.info("selecting Koppen Geiger climate classification")
130-
KG.append(load_koppen_geiger(geometry=geometry)[1, 1][0][0])
131-
except Exception as e:
132-
KG.append(np.nan)
133-
logger.exception(e)
134-
135-
if not "doy" in FLiES_inputs_df.columns:
136-
FLiES_inputs_df["doy"] = doy
137-
138-
if not "AOT" in FLiES_inputs_df.columns:
139-
FLiES_inputs_df["AOT"] = AOT
140-
141-
if not "COT" in FLiES_inputs_df.columns:
142-
FLiES_inputs_df["COT"] = COT
143-
144-
if not "vapor_gccm" in FLiES_inputs_df.columns:
145-
FLiES_inputs_df["vapor_gccm"] = vapor_gccm
146-
147-
if not "ozone_cm" in FLiES_inputs_df.columns:
148-
FLiES_inputs_df["ozone_cm"] = ozone_cm
149-
150-
FLiES_inputs_df["SWin"] = SWin
151-
FLiES_inputs_df["Tmin_K"] = Tmin_K
21+
Generates a DataFrame of FLiES inputs by retrieving atmospheric and static data.
15222
153-
if not "SZA" in FLiES_inputs_df.columns:
154-
FLiES_inputs_df["SZA"] = SZA
23+
This is a simple wrapper around retrieve_FLiESANN_inputs that handles DataFrame
24+
input/output and geometry parsing.
25+
26+
Parameters:
27+
input_df (pd.DataFrame): A DataFrame containing the following columns:
28+
- time_UTC (str or datetime): Time in UTC.
29+
- geometry (str or shapely.geometry.Point) or (lat, lon): Spatial coordinates.
30+
If "geometry" is a string, it should be in WKT format (e.g., "POINT (lon lat)").
31+
- albedo (float, optional): Surface albedo.
32+
- COT (float, optional): Cloud optical thickness.
33+
- AOT (float, optional): Aerosol optical thickness.
34+
- vapor_gccm (float, optional): Water vapor in grams per cubic centimeter.
35+
- ozone_cm (float, optional): Ozone concentration in centimeters.
36+
- elevation_m (float, optional): Elevation in meters.
37+
- SZA_deg (float, optional): Solar zenith angle in degrees.
38+
- KG or KG_climate (str, optional): Köppen-Geiger climate classification.
39+
- SWin_Wm2 (float, optional): Shortwave incoming solar radiation.
40+
- day_of_year (float, optional): Day of year.
41+
- hour_of_day (float, optional): Hour of day.
42+
GEOS5FP_connection (GEOS5FP, optional): Connection object for GEOS-5 FP data.
43+
NASADEM_connection (NASADEMConnection, optional): Connection object for NASADEM data.
44+
45+
Returns:
46+
pd.DataFrame: A DataFrame with the same structure as the input, but with additional columns:
47+
- albedo: Surface albedo array
48+
- COT: Cloud optical thickness array
49+
- AOT: Aerosol optical thickness array
50+
- vapor_gccm: Water vapor array
51+
- ozone_cm: Ozone concentration array
52+
- elevation_m: Elevation in meters array
53+
- elevation_km: Elevation in kilometers array
54+
- KG_climate: Climate classification array
55+
- SZA_deg: Solar zenith angle array
56+
- SWin_Wm2: Shortwave incoming radiation array
57+
- day_of_year: Day of year array
58+
- atype: Aerosol type array
59+
- ctype: Cloud type array
60+
61+
Raises:
62+
KeyError: If required columns ("geometry" or "lat" and "lon") are missing.
63+
"""
64+
def ensure_geometry(row):
65+
if "geometry" in row:
66+
if isinstance(row.geometry, str):
67+
s = row.geometry.strip()
68+
if s.startswith("POINT"):
69+
coords = s.replace("POINT", "").replace("(", "").replace(")", "").strip().split()
70+
return Point(float(coords[0]), float(coords[1]))
71+
elif "," in s:
72+
coords = [float(c) for c in s.split(",")]
73+
return Point(coords[0], coords[1])
74+
else:
75+
coords = [float(c) for c in s.split()]
76+
return Point(coords[0], coords[1])
77+
return row.geometry
78+
79+
logger.info("started generating FLiES input table")
80+
81+
# Ensure geometry column is properly formatted
82+
input_df = input_df.copy()
83+
input_df["geometry"] = input_df.apply(ensure_geometry, axis=1)
84+
85+
# Prepare output DataFrame
86+
output_df = input_df.copy()
15587

156-
if not "KG" in FLiES_inputs_df.columns:
157-
FLiES_inputs_df["KG"] = KG
88+
# Prepare geometries
89+
if "geometry" in input_df.columns:
90+
geometries = MultiPoint([(geom.x, geom.y) for geom in input_df.geometry], crs=WGS84)
91+
elif "lat" in input_df.columns and "lon" in input_df.columns:
92+
geometries = MultiPoint([(lon, lat) for lon, lat in zip(input_df.lon, input_df.lat)], crs=WGS84)
93+
else:
94+
raise KeyError("Input DataFrame must contain either 'geometry' or both 'lat' and 'lon' columns.")
15895

159-
if "Ta" in FLiES_inputs_df and "Ta_C" not in FLiES_inputs_df:
160-
FLiES_inputs_df.rename({"Ta": "Ta_C"}, inplace=True)
96+
# Convert time column to datetime
97+
times_UTC = pd.to_datetime(input_df.time_UTC)
16198

162-
return FLiES_inputs_df
99+
logger.info(f"generating inputs for {len(input_df)} rows")
100+
101+
# Helper function to get column values or None if column doesn't exist
102+
def get_column_or_none(df, col_name, default_col_name=None):
103+
if col_name in df.columns:
104+
return df[col_name].values
105+
elif default_col_name and default_col_name in df.columns:
106+
return df[default_col_name].values
107+
else:
108+
return None
109+
110+
# Retrieve all inputs at once using vectorized retrieve_FLiESANN_inputs call
111+
FLiES_inputs = retrieve_FLiESANN_inputs(
112+
geometry=geometries,
113+
time_UTC=times_UTC,
114+
albedo=get_column_or_none(input_df, "albedo"),
115+
COT=get_column_or_none(input_df, "COT"),
116+
AOT=get_column_or_none(input_df, "AOT"),
117+
vapor_gccm=get_column_or_none(input_df, "vapor_gccm"),
118+
ozone_cm=get_column_or_none(input_df, "ozone_cm"),
119+
elevation_m=get_column_or_none(input_df, "elevation_m"),
120+
SZA_deg=get_column_or_none(input_df, "SZA_deg", "SZA"),
121+
KG_climate=get_column_or_none(input_df, "KG_climate", "KG"),
122+
SWin_Wm2=get_column_or_none(input_df, "SWin_Wm2"),
123+
day_of_year=get_column_or_none(input_df, "day_of_year"),
124+
hour_of_day=get_column_or_none(input_df, "hour_of_day"),
125+
GEOS5FP_connection=GEOS5FP_connection,
126+
NASADEM_connection=NASADEM_connection
127+
)
128+
129+
# Add retrieved inputs to the output DataFrame
130+
for key, values in FLiES_inputs.items():
131+
# Skip values with mismatched lengths
132+
if hasattr(values, '__len__') and not isinstance(values, str):
133+
if len(values) != len(output_df):
134+
continue
135+
output_df[key] = values
136+
137+
logger.info("completed generating FLiES input table")
138+
139+
return output_df

0 commit comments

Comments
 (0)