Skip to content

Commit ea5a1b5

Browse files
author
Sean Nugent
committed
Fix bulk load
1 parent 8be8e16 commit ea5a1b5

File tree

7 files changed

+485
-21
lines changed

7 files changed

+485
-21
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,6 @@ datastore_loader.log
3737
/automation/Pending/metadata/*
3838
/automation/__pycache__
3939
/automation/Functions/__pycache__
40-
/automation/Functions/__pycache__/*
40+
/automation/Functions/__pycache__/*
41+
*Archive
42+
*__pycache__/
Binary file not shown.
Binary file not shown.

automation/a_test_ckan_connection.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 4,
5+
"execution_count": 2,
66
"metadata": {},
77
"outputs": [
88
{

automation/b_generate_metadata_template.ipynb

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,13 @@
99
"name": "stdout",
1010
"output_type": "stream",
1111
"text": [
12-
"✅ Metadata template generated: Pending/metadata\\metadata_Carbon Footprint Breakdown_20250509.json\n",
13-
"✅ Metadata template generated: Pending/metadata\\metadata_Data Zone Lookup - Archived Geographies_20250509.json\n",
14-
"✅ Metadata template generated: Pending/metadata\\metadata_Deaths Involving Coronavirus (Covid-19)_20250509.json\n",
15-
"✅ Metadata template generated: Pending/metadata\\metadata_Gender Pay Gap_20250509.json\n",
16-
"✅ Metadata template generated: Pending/metadata\\metadata_Gross Domestic Product - Quarterly Output by Industry_20250509.json\n",
17-
"✅ Metadata template generated: Pending/metadata\\metadata_House Prices - Residential Properties, Sales and Price_20250509.json\n",
18-
"✅ Metadata template generated: Pending/metadata\\metadata_Life Expectancy_20250509.json\n",
19-
"✅ Metadata template generated: Pending/metadata\\metadata_Non-Domestic Energy Performance Certificates_20250509.json\n",
20-
"✅ Metadata template generated: Pending/metadata\\metadata_Police Officer Quarterly Strength_20250509.json\n",
21-
"✅ Metadata template generated: Pending/metadata\\metadata_Population Estimates - Young and Old_20250509.json\n",
22-
"✅ Metadata template generated: Pending/metadata\\metadata_Primary 1 Children Body Mass Index - Clinical_20250509.json\n",
23-
"✅ Metadata template generated: Pending/metadata\\metadata_Scottish Accommodation Occupancy_20250509.json\n",
24-
"✅ Metadata template generated: Pending/metadata\\metadata_Scottish Health Survey - Scotland Level Data by Sex_20250509.json\n"
12+
"✅ Metadata template generated: Pending/metadata\\metadata_Carbon Footprint Breakdown_20250529.json\n",
13+
"✅ Metadata template generated: Pending/metadata\\metadata_Data Zone Lookup - Archived Geographies_20250529.json\n",
14+
"✅ Metadata template generated: Pending/metadata\\metadata_House Prices - Residential Properties, Sales and Price_20250529.json\n",
15+
"✅ Metadata template generated: Pending/metadata\\metadata_Population Estimates - Young and Old_20250529.json\n",
16+
"✅ Metadata template generated: Pending/metadata\\metadata_Primary 1 Children Body Mass Index - Clinical_20250529.json\n",
17+
"✅ Metadata template generated: Pending/metadata\\metadata_Scottish Accommodation Occupancy_20250529.json\n",
18+
"✅ Metadata template generated: Pending/metadata\\metadata_Scottish Health Survey - Scotland Level Data by Sex_20250529.json\n"
2519
]
2620
}
2721
],
@@ -59,7 +53,8 @@
5953
" {\"key\": \"Relevance\", \"value\": \"\"},\n",
6054
" {\"key\": \"Timeliness and Punctuality\", \"value\": \"\"},\n",
6155
" {\"key\": \"Confidentiality Policy\", \"value\": \"\"},\n",
62-
" {\"key\": \"Revisions\", \"value\": \"\"}\n",
56+
" {\"key\": \"Revisions\", \"value\": \"\"},\n",
57+
" {\"key\": \"headline\", \"value\": \"\"}\n",
6358
" ]\n",
6459
" },\n",
6560
" \"resource\": {\n",

automation/c_load_files_to_ckan.ipynb

Lines changed: 135 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,72 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 1,
66
"metadata": {},
7-
"outputs": [],
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"2025-05-29 10:00:30,482 - INFO - Starting file processing...\n",
13+
"2025-05-29 10:00:30,482 - INFO - Starting file processing...\n",
14+
"2025-05-29 10:00:30,484 - INFO - Found 1 files to process\n",
15+
"2025-05-29 10:00:30,484 - INFO - Found 1 files to process\n",
16+
"2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n",
17+
"2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n",
18+
"2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n",
19+
"2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n",
20+
"2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n",
21+
"2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n",
22+
"2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n",
23+
"2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n",
24+
"2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n",
25+
"2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n",
26+
"2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n",
27+
"2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n",
28+
"2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n",
29+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
30+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
31+
"2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n",
32+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
33+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
34+
"2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n",
35+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
36+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
37+
"2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n",
38+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
39+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
40+
"2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n",
41+
"2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n",
42+
"2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n",
43+
"2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n",
44+
"2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n",
45+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
46+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
47+
"2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n",
48+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
49+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
50+
"2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n",
51+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
52+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
53+
"2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n",
54+
"URL: http://35.177.24.156:5000/api/3/action/package_show\n",
55+
"Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n",
56+
"2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n",
57+
"2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n",
58+
"2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n",
59+
"2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n",
60+
"2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n",
61+
"2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n",
62+
"2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n",
63+
"2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n",
64+
"2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n",
65+
"2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n",
66+
"2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n",
67+
"2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n"
68+
]
69+
}
70+
],
871
"source": [
972
"import os\n",
1073
"import pandas as pd\n",
@@ -26,6 +89,73 @@
2689
" )\n",
2790
" self.logger = setup_logging()\n",
2891
"\n",
def _detect_encoding(self, file_path: str) -> str:
    """Return an encoding name that decodes the whole of ``file_path``.

    chardet only inspects the first 100KB of the file, so its answer is
    treated as a candidate: it is accepted only after the complete file
    has been decoded with it. Anything else (no answer, low confidence,
    unknown codec name, decode failure further into the file) is handed
    to ``self._try_fallback_encodings``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of an encoding to pass to ``open`` / ``pd.read_csv``.
    """
    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read(100000)  # Sample the first 100KB for detection
        result = chardet.detect(raw_data)
    except Exception as e:
        # Keep the original best-effort behaviour: log and use the fallbacks.
        # The fallback call sits outside this try so it is never run twice.
        self.logger.error(f"Error detecting encoding for {file_path}: {str(e)}")
        return self._try_fallback_encodings(file_path)

    encoding = result.get('encoding')
    # chardet reports no encoding for data it cannot classify (e.g. empty input).
    confidence = result.get('confidence') or 0.0

    self.logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2f}) for {file_path}")

    # If there is no answer or confidence is low, try common encodings
    if not encoding or confidence < 0.7:
        self.logger.warning("Low confidence in encoding detection. Trying fallback encodings.")
        return self._try_fallback_encodings(file_path)

    # An "ascii" verdict only describes the sampled bytes. utf-8 decodes
    # every ASCII file identically and also tolerates later non-ASCII text.
    if encoding.lower() == 'ascii':
        encoding = 'utf-8'

    # Validate the candidate against the entire file, not just the sample.
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            while file.read(65536):
                pass
    except (UnicodeError, LookupError) as e:
        # UnicodeError: bytes the codec rejects; LookupError: a codec name
        # that this Python installation does not know.
        self.logger.warning(
            f"Detected encoding {encoding} does not decode {file_path}: {str(e)}. "
            "Trying fallback encodings."
        )
        return self._try_fallback_encodings(file_path)

    return encoding
112+
"\n",
def _try_fallback_encodings(self, file_path: str) -> str:
    """Pick the first common encoding that decodes the whole file.

    Strict codecs are tried before permissive ones. latin-1 maps every
    possible byte to a character, so it can never fail and is only used
    as the final fallback; placing it earlier would make every encoding
    listed after it unreachable. ('iso-8859-1' is an alias of latin-1 and
    is therefore not listed separately.)

    The complete file is decoded in chunks for each candidate, because an
    invalid byte may appear well after the start of the file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of the first encoding that decodes the file, or
        'latin-1' when none of the strict candidates does.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-16' requires a byte-order mark, so it does not match arbitrary
    # single-byte text by accident.
    encodings_to_try = ['utf-8', 'utf-16', 'cp1252']

    for encoding in encodings_to_try:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                while file.read(65536):
                    pass
        except UnicodeError:
            # UnicodeError (not only UnicodeDecodeError): a missing UTF-16
            # byte-order mark is reported with the base class.
            continue
        self.logger.info(f"Successfully validated encoding: {encoding} for {file_path}")
        return encoding

    # If all else fails, use latin-1 (it can decode any byte sequence)
    self.logger.warning(f"Using latin-1 as fallback encoding for {file_path}")
    return 'latin-1'
129+
"\n",
def _read_csv_with_encoding(self, file_path: str) -> pd.DataFrame:
    """Read a CSV file, trying lossless encodings until one decodes it.

    The encoding reported by ``self._detect_encoding`` is tried first,
    followed by utf-8, cp1252 and latin-1. Every attempt is a strict
    decode: no bytes are dropped or substituted, so non-ASCII characters
    in the data survive the load. (Retrying with
    ``encoding_errors='ignore'`` would "succeed" by silently deleting
    the undecodable bytes, which corrupts the loaded values.)

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If no candidate encoding could decode the file. Errors
            unrelated to text decoding (missing file, malformed CSV) are
            propagated unchanged from ``pd.read_csv``.
    """
    detected = self._detect_encoding(file_path)

    # Detected encoding first, then progressively more permissive codecs.
    # latin-1 accepts every byte, so it is the lossless last resort.
    candidates = []
    for name in (detected, 'utf-8', 'cp1252', 'latin-1'):
        if name and name not in candidates:
            candidates.append(name)

    last_error = None
    for encoding in candidates:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except (UnicodeError, LookupError) as e:
            # UnicodeError: undecodable bytes; LookupError: unknown codec name.
            self.logger.warning(f"Failed to read with {encoding}, trying fallback methods: {str(e)}")
            last_error = e
            continue
        self.logger.info(f"Successfully read CSV with {encoding} encoding")
        return df

    # Defensive: latin-1 should always decode, so this is not expected to run.
    raise Exception(f"Failed to read CSV with any encoding method: {str(last_error)}") from last_error
158+
"\n",
29159
" def process_files(self) -> None:\n",
30160
" \"\"\"Main processing loop with error containment.\"\"\"\n",
31161
" files = self._get_pending_files()\n",
@@ -46,9 +176,9 @@
46176
" completed_file_path = os.path.join(self.config.completed_file_dir, filename)\n",
47177
"\n",
48178
" try:\n",
49-
" # Step 1: Load data\n",
179+
" # Step 1: Load data with proper encoding handling\n",
50180
" if filename.endswith('.csv'):\n",
51-
" df = pd.read_csv(file_path)\n",
181+
" df = self._read_csv_with_encoding(file_path)\n",
52182
" elif filename.endswith('.xlsx'):\n",
53183
" df = pd.read_excel(file_path)\n",
54184
" else:\n",
@@ -130,7 +260,7 @@
130260
" processor.process_files()\n",
131261
"\n",
132262
"if __name__ == \"__main__\":\n",
133-
" run()\n"
263+
" run()"
134264
]
135265
}
136266
],

0 commit comments

Comments
 (0)