"""
This script reads the Excel spreadsheet from RMS
and converts each sheet into a dataframe.
It then remaps the column names and checks for invalid records.
Finally, it prepares the dataframe to be pushed to the database.
Author: Javier Soto
Date: 06/01/2024
Version: V1.0
"""
## IMPORT LIBRARIES
import os
import sys
import argparse
import pandas as pd
import geopandas as gpd
import psycopg2
import psycopg2.extras as extras
import numpy as np
from shapely.geometry import Polygon, Point, MultiPolygon
from config import *
###################################################################################
###################################################################################
###################################################################################
parser = argparse.ArgumentParser(description="Process input variables for the script.")
# example: python rmn_spatial_template_reader.py -r MS01 -g 502418 -v 1-year "C:\javi\repos\rmn\data\MS01 - post\DONE-PA - Monitoring - RMN survey - 202324 - Post-restoration - MS01 Ben Lawyers - 1-year excel - 08 Oct 2024 (A4770282).xlsx"
parser.add_argument('-r', '--rmn_id', type=str, required=True, help="The RMN ID")
parser.add_argument('-g', '--grant_id', type=str, required=True, help="The Grant ID")
parser.add_argument('-v', '--visit', type=str, required=True, help="The Visit")
parser.add_argument('path', type=str, help="The file path, just paste it")
args = parser.parse_args()
rmn_id = args.rmn_id
grant_id = args.grant_id
visit = args.visit
file_path = args.path
if not os.path.exists(file_path):
    print(f"Error: The file at '{file_path}' does not exist")
    sys.exit(1)
###################################################################################
###################################################################################
###################################################################################
pd.options.mode.chained_assignment = None # default='warn'
## DB connection credentials:
conn = psycopg2.connect(
    database=dbname, user=username, password=password, host=host, port=port
)
def check_values_not_exist(conn, tables, columns, values):
    """
    Check if the combination of values does not exist in the specified tables and columns.
    Parameters:
        conn: psycopg2 connection object
        tables (list): List of table names to query.
        columns (list): List of column names in the order [rmn_id, grant_id, visit].
        values (list): List of values to check in the order [rmn_id, grant_id, visit].
    Returns:
        list: Table names where the values do not already exist, or None if an error occurred.
    """
    if len(columns) != 3 or len(values) != 3:
        raise ValueError("Columns and values must each contain exactly 3 items: [rmn_id, grant_id, visit].")
    rmn_id, grant_id, visit = columns
    value_rmn_id, value_grant_id, value_visit = values
    # List to store tables where the values do not exist
    tables_where_values_not_exist = []
    try:
        with conn.cursor() as cur:
            for table in tables:
                # Build the query dynamically
                query = (
                    f"SELECT EXISTS ("
                    f"    SELECT 1 FROM pa_restoration_monitoring_network.{table}"
                    f"    WHERE {rmn_id} = %s AND {grant_id} = %s AND {visit} = %s"
                    f")")
                # Execute the query
                cur.execute(query, (value_rmn_id, value_grant_id, value_visit))
                # Fetch the result (True if exists, False otherwise)
                result = cur.fetchone()[0]
                # Add the table to the list if the values do not exist
                if not result:
                    tables_where_values_not_exist.append(table)
                else:
                    print(f"\n....The table {table} already has a survey in it with the same {grant_id}, {rmn_id} and {visit}. Skipping....\n")
        # Return the list of tables where the values were not found
        return tables_where_values_not_exist
    except Exception as e:
        print(f"Error occurred: {e}")
        return None
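# Usage sketch (illustrative values taken from the example command above; 'sampling_point'
# is one of the database layers used later in this script):
#   pending = check_values_not_exist(conn, ['sampling_point'],
#                                    ['rmn_id', 'grant_id', 'visit'],
#                                    ['MS01', '502418', '1-year'])
#   # -> ['sampling_point'] if that survey is not yet in the table, [] if it already is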
def execute_values(conn, df, table):
    """
    Insert a dataframe into an existing PostgreSQL table.
    Based on: https://www.geeksforgeeks.org/how-to-insert-a-pandas-dataframe-to-an-existing-postgresql-table/
    """
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ','.join(list(df.columns))
    # SQL query to execute (%%s is escaped so extras.execute_values receives the VALUES placeholder)
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    print(query)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("\n....The dataframe is inserted....\n")
    cursor.close()
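# Usage sketch (mirrors the call made at the end of remap_and_export below):
#   execute_values(conn, df, "pa_restoration_monitoring_network.sampling_point")
# Note: the dataframe column names must match the target table's columns exactly.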
def multipolygon_to_polygon(df):
    # explode() is not in-place; assign the returned dataframe (one row per polygon part)
    df = df.explode(index_parts=False)
    return df
## SCRIPT SETTINGS
pd.set_option('display.max_columns', None) # print all column names
## READ EXCEL MAP
map_df = pd.read_excel(r'map\RMN data for database.xlsx')
## FILTER MAP BY THE 'Upload to DB' COLUMN
map_df = map_df[map_df['Upload to DB'] == 'Yes']  # keep only rows where 'Upload to DB' is Yes
## CREATE LIST OF SHEETS OF INTEREST
sheets_of_interest = map_df['Tab or geopackage layer'].unique()  ## get the list from the map dataframe
tables = map_df['Database layer'].unique()  ## get the list for the checker
## Parameters for the checker function
columns = ['rmn_id', 'grant_id', 'visit']
values = [rmn_id, grant_id, visit]
## Tables to push data to the database
tables_to_push = check_values_not_exist(conn, tables, columns, values)
# function to rename the sampling points (zero-pads single-digit suffixes)
def modify_sampling_point(value):
    parts = value.rsplit("_", 1)  # split off the last part of the string, from the right
    if len(parts) == 2 and parts[-1].isdigit():  # make sure the split is valid and the last part is numeric
        if len(parts[-1]) == 1:  # check if the suffix is a single digit (1-9)
            parts[-1] = f"0{parts[-1]}"  # add the leading 0
    return "_".join(parts)  # join the (possibly modified) value back together
## function to remap sheets and export csv:
def remap_and_export(s, df, rmn_id, grant_id, visit, conn, tables_to_push):
    print("\n\n 3 WITHIN THE REMAP AND EXPORT FUNCTION. S: ", s)
    try:
        # filter remap_df based on the layer name
        remap_df = map_df[map_df['Tab or geopackage layer'] == s]
        print("remap_df\n\n", remap_df)
        # convert column names to a dictionary
        rename_mapping = dict(zip(remap_df['Field name'], remap_df['Field name for DB']))
        print("Rename_mapping: ", rename_mapping)
    except Exception as e:
        print("Issue when preparing dictionary for mapping: ", e)
    ## Add missing fields to the dataframe (fields added over successive template versions)
    if s == 'Photos':
        df['dams_link'] = df.get('dams_link', pd.NA)
    if s == 'Area-level assessment':
        df['other_damage_notes'] = df.get('other_damage_notes', pd.NA)
    # rename columns in the dataframe using the remap dataframe
    try:
        df.rename(columns=rename_mapping, inplace=True)
    except Exception as e:
        print("Issue when renaming columns: ", e)
    # fix date/time types in any dataframe
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
    ## keep only the rows from the most recent year
    if s == 'sampling_point':
        df['year'] = df['date'].dt.year  # extract year from the date column
        print(df['year'].nunique())  # print number of unique years
        print(df.info())  # print dataframe info
        if df['year'].nunique() == 1:
            df.drop('year', axis=1, inplace=True)  # drop the year column
            print("NO YEAR", df.head(1))
        elif df['year'].nunique() > 1:
            df = df[df['year'] == df['year'].max()]  # keep only rows with the highest year
            print("MAX YEAR", df['year'].max())
            df.drop(columns=['year'], inplace=True)  # drop the year column after filtering
    # keep only the columns from the remap dataframe
    try:
        print("Just about to remap: ", df)
        df = df[list(rename_mapping.values())]
        print("df that keeps the renamed columns only: ", df)
    except Exception as e:
        print("Issue when keeping the columns after remapping: ", e)
    ## add the project ID codes to every single table
    df['rmn_id'] = rmn_id
    df['grant_id'] = grant_id
    df['visit'] = visit
    # Replace all NaN or NaT in any table
    df = df.replace(r'^\s*$', np.nan, regex=True)  ## replaces whitespace-only values with NaN
    df = df.replace({np.nan: None})  ## replaces any NaN or NaT with None (NULL for PostgreSQL)
    print("\n\n 4 CLEAN : ", s)
    ## keep only the two columns with layer names and remove all duplicates
    column_remap_df = remap_df[['Tab or geopackage layer', 'Database layer']].drop_duplicates()
    df.to_csv(f'{s}_after_test.csv')  ## just for testing
    if "sampling_point" in df.columns:
        # Apply the transformation
        df["sampling_point"] = df["sampling_point"].apply(modify_sampling_point)
    if "drain_point" in df.columns:
        # derive the drain point name from the sampling point by adding the '_drain' suffix
        df["drain_point"] = df["sampling_point"] + "_drain"
    if column_remap_df['Database layer'].iloc[0] in tables_to_push:
        print(f"\n....Pushing table {column_remap_df['Database layer'].iloc[0]}....\n")
        ## Push to DB; iloc[0] takes the database layer name for this sheet
        execute_values(conn, df, "pa_restoration_monitoring_network." + column_remap_df['Database layer'].iloc[0])
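# For reference, rename_mapping built above is a plain {source field -> DB field} dict,
# e.g. (hypothetical field names): {'Sampling point': 'sampling_point', 'Date': 'date'}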
## loop over each sheet from each excel survey
for s in sheets_of_interest:
    print("\n\n 1 INDIVIDUAL SHEETS FROM EXCEL ", s)
    try:
        print("\n 2 ....Preparing table: ", s, "\n")
        df = gpd.read_file(file_path, layer=s)
        if s == 'monitoring_area':
            # if a geometry is a single polygon, convert it to a multipolygon
            def to_multipolygon(geometry):
                if isinstance(geometry, Polygon):
                    return MultiPolygon([geometry])
                return geometry
            df['geometry'] = df['geometry'].apply(to_multipolygon)
            df = df.dissolve()
        if s == 'sampling_point':
            df = df[df['source'] == 'field'].reset_index(drop=True)
            df['geometry'] = df['geometry'].apply(lambda point: Point(point.x, point.y))
        if s == 'drain_points':
            df['geometry'] = df['geometry'].apply(lambda point: Point(point.x, point.y))
        df = df.dropna(axis=0, how='all')  # delete all rows that are entirely null
        df = df.dropna(axis=1, how='all')  # delete all columns that are entirely null
        df = df.reset_index(drop=True)
        df['geometry'] = df['geometry'].apply(lambda geom: geom.wkt)  # convert all geometries to WKT, otherwise psycopg2 can't deal with them
        df.to_csv(f'{s}_before_test.csv')  ## just for testing
        # get column names from the map file, then remap and push
        remap_and_export(s, df, rmn_id, grant_id, visit, conn, tables_to_push)
        check_values_not_exist(conn, tables, columns, values)
    except ValueError:
        print(f'{s}: Layer not found')
    except KeyError as k:
        print(f'{s}: Key Error {k}')
    except TypeError:
        print(f'{s}: Data Type Error')
    except FileNotFoundError:
        print(f'{file_path}: File not found')
    except Exception as e:
        print(f'Unexpected error occurred. Could be related to the sheet names in the excel file being different or not present: {e}')
## check for invalid and null records