|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": null, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {},
|
7 |
| - "outputs": [], |
| 7 | + "outputs": [ |
| 8 | + { |
| 9 | + "name": "stderr", |
| 10 | + "output_type": "stream", |
| 11 | + "text": [ |
| 12 | + "2025-05-29 10:00:30,482 - INFO - Starting file processing...\n", |
| 13 | + "2025-05-29 10:00:30,482 - INFO - Starting file processing...\n", |
| 14 | + "2025-05-29 10:00:30,484 - INFO - Found 1 files to process\n", |
| 15 | + "2025-05-29 10:00:30,484 - INFO - Found 1 files to process\n", |
| 16 | + "2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n", |
| 17 | + "2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n", |
| 18 | + "2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n", |
| 19 | + "2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\\files\\Data Zone Lookup - Archived Geographies.csv\n", |
| 20 | + "2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n", |
| 21 | + "2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n", |
| 22 | + "2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n", |
| 23 | + "2025-05-29 10:00:30,533 - WARNING - Failed to read with ascii, trying fallback methods: 'ascii' codec can't decode byte 0xf2 in position 73819: ordinal not in range(128)\n", |
| 24 | + "2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n", |
| 25 | + "2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n", |
| 26 | + "2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n", |
| 27 | + "2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling\n", |
| 28 | + "2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 29 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 30 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 31 | + "2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 32 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 33 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 34 | + "2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 35 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 36 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 37 | + "2025-05-29 10:00:31,640 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 38 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 39 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 40 | + "2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n", |
| 41 | + "2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n", |
| 42 | + "2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n", |
| 43 | + "2025-05-29 10:00:32,980 - INFO - Dataset created: 2011_data_zone_archived_geographies\n", |
| 44 | + "2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 45 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 46 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 47 | + "2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 48 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 49 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 50 | + "2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 51 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 52 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 53 | + "2025-05-29 10:00:33,442 - ERROR - HTTP Error [404] Attempt 1/3\n", |
| 54 | + "URL: http://35.177.24.156:5000/api/3/action/package_show\n", |
| 55 | + "Response: {\"help\": \"https://cobaltadmin.sgdatacatalogue.net/api/3/action/help_show?name=package_show\", \"error\": {\"__type\": \"Not Found Error\", \"message\": \"Not found\"}, \"success\": false}\n", |
| 56 | + "2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n", |
| 57 | + "2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n", |
| 58 | + "2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n", |
| 59 | + "2025-05-29 10:00:35,780 - INFO - Resource created: Data Zone Lookup - Archived Geographies.csv\n", |
| 60 | + "2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n", |
| 61 | + "2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n", |
| 62 | + "2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n", |
| 63 | + "2025-05-29 10:00:35,812 - ERROR - Resource error: 'utf-8' codec can't decode byte 0xf2 in position 73819: invalid continuation byte\n", |
| 64 | + "2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n", |
| 65 | + "2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n", |
| 66 | + "2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n", |
| 67 | + "2025-05-29 10:00:35,817 - INFO - Report generated: ./Completed\\report\\report_20250529_100035.csv\n" |
| 68 | + ] |
| 69 | + } |
| 70 | + ], |
8 | 71 | "source": [
|
9 | 72 | "import os\n",
|
10 | 73 | "import pandas as pd\n",
|
|
26 | 89 | " )\n",
|
27 | 90 | " self.logger = setup_logging()\n",
|
28 | 91 | "\n",
|
| 92 | + " def _detect_encoding(self, file_path: str) -> str:\n", |
| 93 | + " \"\"\"Detect file encoding using chardet.\"\"\"\n", |
| 94 | + " try:\n", |
| 95 | + " with open(file_path, 'rb') as file:\n", |
| 96 | + " raw_data = file.read(100000) # Read first 100KB for detection\n", |
| 97 | + " result = chardet.detect(raw_data)\n", |
| 98 | + " encoding = result['encoding']\n", |
| 99 | + " confidence = result['confidence']\n", |
| 100 | + " \n", |
| 101 | + " self.logger.info(f\"Detected encoding: {encoding} (confidence: {confidence:.2f}) for {file_path}\")\n", |
| 102 | + " \n", |
| 103 | + " # If confidence is low, try common encodings\n", |
| 104 | + " if confidence < 0.7:\n", |
| 105 | + " self.logger.warning(f\"Low confidence in encoding detection. Trying fallback encodings.\")\n", |
| 106 | + " return self._try_fallback_encodings(file_path)\n", |
| 107 | + " \n", |
| 108 | + " return encoding\n", |
| 109 | + " except Exception as e:\n", |
| 110 | + " self.logger.error(f\"Error detecting encoding for {file_path}: {str(e)}\")\n", |
| 111 | + " return self._try_fallback_encodings(file_path)\n", |
| 112 | + "\n", |
| 113 | + " def _try_fallback_encodings(self, file_path: str) -> str:\n", |
| 114 | + " \"\"\"Try common encodings when detection fails or has low confidence.\"\"\"\n", |
| 115 | + " encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']\n", |
| 116 | + " \n", |
| 117 | + " for encoding in encodings_to_try:\n", |
| 118 | + " try:\n", |
| 119 | + " with open(file_path, 'r', encoding=encoding) as file:\n", |
| 120 | + " file.read(1000) # Try to read first 1000 characters\n", |
| 121 | + " self.logger.info(f\"Successfully validated encoding: {encoding} for {file_path}\")\n", |
| 122 | + " return encoding\n", |
| 123 | + " except UnicodeDecodeError:\n", |
| 124 | + " continue\n", |
| 125 | + " \n", |
| 126 | + " # If all else fails, use latin-1 (it can decode any byte sequence)\n", |
| 127 | + " self.logger.warning(f\"Using latin-1 as fallback encoding for {file_path}\")\n", |
| 128 | + " return 'latin-1'\n", |
| 129 | + "\n", |
| 130 | + " def _read_csv_with_encoding(self, file_path: str) -> pd.DataFrame:\n", |
| 131 | + " \"\"\"Read CSV file with proper encoding detection and handling.\"\"\"\n", |
| 132 | + " encoding = self._detect_encoding(file_path)\n", |
| 133 | + " \n", |
| 134 | + " try:\n", |
| 135 | + " # Try reading with detected encoding\n", |
| 136 | + " df = pd.read_csv(file_path, encoding=encoding)\n", |
| 137 | + " self.logger.info(f\"Successfully read CSV with {encoding} encoding\")\n", |
| 138 | + " return df\n", |
| 139 | + " except UnicodeDecodeError as e:\n", |
| 140 | + " self.logger.warning(f\"Failed to read with {encoding}, trying fallback methods: {str(e)}\")\n", |
| 141 | + " \n", |
| 142 | + " # Try with error handling\n", |
| 143 | + " for error_handling in ['ignore', 'replace']:\n", |
| 144 | + " try:\n", |
| 145 | + " df = pd.read_csv(file_path, encoding=encoding, encoding_errors=error_handling)\n", |
| 146 | + " self.logger.info(f\"Successfully read CSV with {encoding} encoding and {error_handling} error handling\")\n", |
| 147 | + " return df\n", |
| 148 | + " except Exception as e:\n", |
| 149 | + " continue\n", |
| 150 | + " \n", |
| 151 | + " # Last resort: try latin-1\n", |
| 152 | + " try:\n", |
| 153 | + " df = pd.read_csv(file_path, encoding='latin-1')\n", |
| 154 | + " self.logger.info(\"Successfully read CSV with latin-1 encoding as last resort\")\n", |
| 155 | + " return df\n", |
| 156 | + " except Exception as e:\n", |
| 157 | + " raise Exception(f\"Failed to read CSV with any encoding method: {str(e)}\")\n", |
| 158 | + "\n", |
29 | 159 | " def process_files(self) -> None:\n",
|
30 | 160 | " \"\"\"Main processing loop with error containment.\"\"\"\n",
|
31 | 161 | " files = self._get_pending_files()\n",
|
|
46 | 176 | " completed_file_path = os.path.join(self.config.completed_file_dir, filename)\n",
|
47 | 177 | "\n",
|
48 | 178 | " try:\n",
|
49 |
| - " # Step 1: Load data\n", |
| 179 | + " # Step 1: Load data with proper encoding handling\n", |
50 | 180 | " if filename.endswith('.csv'):\n",
|
51 |
| - " df = pd.read_csv(file_path)\n", |
| 181 | + " df = self._read_csv_with_encoding(file_path)\n", |
52 | 182 | " elif filename.endswith('.xlsx'):\n",
|
53 | 183 | " df = pd.read_excel(file_path)\n",
|
54 | 184 | " else:\n",
|
|
130 | 260 | " processor.process_files()\n",
|
131 | 261 | "\n",
|
132 | 262 | "if __name__ == \"__main__\":\n",
|
133 |
| - " run()\n" |
| 263 | + " run()" |
134 | 264 | ]
|
135 | 265 | }
|
136 | 266 | ],
|
|
0 commit comments