Merge pull request #129 from rowingdude/quick_fixes

rowingdude · web-flow · commit 41023b44bc28 · 2024-09-04T15:07:48.000-04:00
Fix MFT record storage and CSV writer initialization
diff --git a/analyzeMFT.py b/analyzeMFT.py
@@ -2,6 +2,9 @@
 import asyncio
 from src.analyzeMFT.cli import main
 
+# Add the 'src' directory next to this script to sys.path so package imports resolve consistently.
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
+
 if __name__ == "__main__":
     if sys.platform == "win32":
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
diff --git a/setup.py b/setup.py
@@ -1,14 +1,16 @@
 from setuptools import setup, find_packages
+from src.analyzeMFT.constants import VERSION
 
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
 setup(
     name='analyzeMFT',
-    version='3.0',
+    version=VERSION,
     author='Benjamin Cance',
     author_email='bjc@tdx.li',
-    packages=find_packages(),
+    package_dir={'': 'src'},
+    packages=find_packages(where='src'),
     url='http://github.com/rowingdude/analyzeMFT',
     license='LICENSE.txt',
     description='Analyze the $MFT from a NTFS filesystem.',
diff --git a/src/analyzeMFT/cli.py b/src/analyzeMFT/cli.py
@@ -29,8 +29,13 @@ async def main():
                             help="Export as log2timeline CSV")
     parser.add_option_group(export_group)
 
-    parser.add_option("-d", "--debug", action="store_true", dest="debug",
-                      help="Enable debug output", default=False)
+    verbosity_group = OptionGroup(parser, "Verbosity Options")
+    verbosity_group.add_option("-v", action="count", dest="verbosity",
+                               help="Increase output verbosity (can be used multiple times)", default=0)
+    verbosity_group.add_option("-d", action="count", dest="debug",
+                               help="Increase debug output (can be used multiple times)", default=0)
+    parser.add_option_group(verbosity_group)
+
     parser.add_option("-H", "--hash", action="store_true", dest="compute_hashes",
                       help="Compute hashes (MD5, SHA256, SHA512, CRC32)", default=False)
 
@@ -46,8 +51,15 @@ async def main():
         print("\nError: No output file specified. Use -o or --output to specify an output file.")
         sys.exit(1)
 
+    # Default to CSV if no format specified
     if not options.export_format:
-        options.export_format = "csv"  # Default to CSV if no format specified
+        options.export_format = "csv"  
+
+
+    analyzer = MftAnalyzer(options.filename, options.output_file, options.debug, options.very_debug, 
+                           options.verbosity, options.compute_hashes, options.export_format)
+    await analyzer.analyze()
+    print(f"Analysis complete. Results written to {options.output_file}")
 
     try:
         analyzer = MftAnalyzer(options.filename, options.output_file, options.debug, options.compute_hashes, options.export_format)
@@ -65,6 +77,7 @@ async def main():
             import traceback
             traceback.print_exc()
         sys.exit(1)
+ master
 
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/src/analyzeMFT/file_writers.py b/src/analyzeMFT/file_writers.py
@@ -4,6 +4,7 @@
 import asyncio
 from typing import List, Dict, Any
 from .mft_record import MftRecord
+from .constants import *
 
 class FileWriters:
     @staticmethod
diff --git a/src/analyzeMFT/mft_analyzer.py b/src/analyzeMFT/mft_analyzer.py
@@ -6,19 +6,27 @@
 from typing import Dict, Set, List, Optional, Any
 from .constants import *
 from .mft_record import MftRecord
+from .file_writers import FileWriters
 
 class MftAnalyzer:
 
-    def __init__(self, mft_file: str, output_file: str, debug: bool = False, compute_hashes: bool = False, export_format: str = "csv") -> None:
+    def __init__(self, mft_file: str, output_file: str, debug: bool = False, very_debug: bool = False, 
+                 verbosity: int = 0, compute_hashes: bool = False, export_format: str = "csv") -> None:
         self.mft_file = mft_file
         self.output_file = output_file
         self.debug = debug
+        self.very_debug = very_debug
+        self.verbosity = verbosity
         self.compute_hashes = compute_hashes
         self.export_format = export_format
-        self.mft_records = []
+        self.compute_hashes = compute_hashes
+        self.export_format = export_format
+        self.mft_records = {}  
         self.interrupt_flag = asyncio.Event()
-        self.csv_writer = None
+        
         self.csvfile = None
+        self.csv_writer = None
+
         self.stats = {
             'total_records': 0,
             'active_records': 0,
@@ -33,20 +41,27 @@ def __init__(self, mft_file: str, output_file: str, debug: bool = False, compute
                 'unique_crc32': set(),
             })
 
+    def log(self, message: str, level: int = 0):
+        if level <= self.debug or level <= self.verbosity:
+            print(message)
 
     async def analyze(self) -> None:
         try:
+            self.initialize_csv_writer()
             await self.process_mft()
             await self.write_output()
         except Exception as e:
             print(f"An unexpected error occurred: {e}")
             if self.debug:
                 traceback.print_exc()
         finally:
+            if self.csvfile:
+                self.csvfile.close()
             self.print_statistics()
 
 
     async def process_mft(self) -> None:
+        self.log(f"Processing MFT file: {self.mft_file}", 1)
         try:
             with open(self.mft_file, 'rb') as f:
                 while not self.interrupt_flag.is_set():
@@ -56,41 +71,40 @@ async def process_mft(self) -> None:
 
                     try:
                         record = MftRecord(raw_record, self.compute_hashes)
-                        
                         self.stats['total_records'] += 1
+                        
                         if record.flags & FILE_RECORD_IN_USE:
                             self.stats['active_records'] += 1
                         if record.flags & FILE_RECORD_IS_DIRECTORY:
                             self.stats['directories'] += 1
                         else:
                             self.stats['files'] += 1
 
-                        if self.compute_hashes:
-                            self.stats['unique_md5'].add(record.md5)
-                            self.stats['unique_sha256'].add(record.sha256)
-                            self.stats['unique_sha512'].add(record.sha512)
-                            self.stats['unique_crc32'].add(record.crc32)
-
-                        if self.debug:
-                            print(f"Processing record {self.stats['total_records']}: {record.filename}")
-
                         self.mft_records[record.recordnum] = record
 
-                        # Write to CSV in blocks of 1000 records
+                        if self.debug >= 2:
+                            self.log(f"Processed record {self.stats['total_records']}: {record.filename}", 2)
+                        elif self.stats['total_records'] % 10000 == 0:
+                            self.log(f"Processed {self.stats['total_records']} records...", 1)
+
                         if self.stats['total_records'] % 1000 == 0:
                             await self.write_csv_block()
-                            self.mft_records.clear()  # Clear processed records to save memory
+                            self.mft_records.clear()
 
                     except Exception as e:
-                        if self.debug:
-                            print(f"Error processing record {self.stats['total_records']}: {str(e)}")
+                        self.log(f"Error processing record {self.stats['total_records']}: {str(e)}", 1)
+                        if self.debug >= 2:
+                            traceback.print_exc()
                         continue
 
         except Exception as e:
-            print(f"Error reading MFT file: {str(e)}")
-            if self.debug:
+            self.log(f"Error reading MFT file: {str(e)}", 0)
+            if self.debug >= 1:
                 traceback.print_exc()
 
+        self.log(f"MFT processing complete. Total records processed: {self.stats['total_records']}", 0)
+
+
     def handle_interrupt(self) -> None:
         if sys.platform == "win32":
             # Windows-specific interrupt handling
@@ -111,26 +125,40 @@ def unix_handler():
                     getattr(signal, signame),
                     unix_handler)
 
+    def initialize_csv_writer(self):
+        if self.csvfile is None:
+            self.csvfile = open(self.output_file, 'w', newline='', encoding='utf-8')
+            self.csv_writer = csv.writer(self.csvfile)
+            self.csv_writer.writerow(CSV_HEADER)
+
     async def write_csv_block(self) -> None:
+        self.log(f"Writing CSV block. Records in block: {len(self.mft_records)}", 2)
         try:
+            if self.csv_writer is None:
+                self.initialize_csv_writer()
+            
             for record in self.mft_records.values():
-                filepath = self.build_filepath(record)
-                csv_row = record.to_csv()
-                csv_row[-1] = filepath  # Replace the filepath placeholder
-
-                csv_row = [str(item) for item in csv_row]
-                
                 try:
-                    self.csv_writer.writerow(csv_row)
-                except UnicodeEncodeError as e:
-                    print(f"Error writing record {record.recordnum}: {str(e)}")
-                    self.csv_writer.writerow([item.encode('utf-8', errors='replace').decode('utf-8') for item in csv_row])
-
-            await asyncio.sleep(0)  # Yield control to allow other tasks to run
+                    filepath = self.build_filepath(record)
+                    csv_row = record.to_csv()
+                    csv_row[-1] = filepath
 
+                    csv_row = [str(item) for item in csv_row]
+                    
+                    self.csv_writer.writerow(csv_row)
+                    if self.very_debug:
+                        self.log(f"Wrote record {record.recordnum} to CSV", 2)
+                except Exception as e:
+                    self.log(f"Error writing record {record.recordnum}: {str(e)}", 1)
+                    if self.very_debug:
+                        traceback.print_exc()
+
+            if self.csvfile:
+                self.csvfile.flush()
+            self.log(f"CSV block written. Current file size: {self.csvfile.tell() if self.csvfile else 0} bytes", 2)
         except Exception as e:
-            print(f"Error writing CSV block: {str(e)}")
-            if self.debug:
+            self.log(f"Error in write_csv_block: {str(e)}", 0)
+            if self.debug or self.very_debug:
                 traceback.print_exc()
 
 
@@ -184,13 +212,14 @@ def print_statistics(self) -> None:
 
 
     async def write_output(self) -> None:
+        print(f"Writing output in {self.export_format} format to {self.output_file}")
         if self.export_format == "csv":
-            await FileWriters.write_csv(self.mft_records, self.output_file)
+            await self.write_remaining_records()
         elif self.export_format == "json":
-            await FileWriters.write_json(self.mft_records, self.output_file)
+            await FileWriters.write_json(list(self.mft_records.values()), self.output_file)
         elif self.export_format == "xml":
-            await FileWriters.write_xml(self.mft_records, self.output_file)
+            await FileWriters.write_xml(list(self.mft_records.values()), self.output_file)
         elif self.export_format == "excel":
-            await FileWriters.write_excel(self.mft_records, self.output_file)
+            await FileWriters.write_excel(list(self.mft_records.values()), self.output_file)
         else:
             print(f"Unsupported export format: {self.export_format}")
diff --git a/src/analyzeMFT/mft_record.py b/src/analyzeMFT/mft_record.py
@@ -4,7 +4,9 @@
 import zlib
 from .constants import *
 from .windows_time import WindowsTime
-from typing import Dict, Set, List, Optional, Any,Union
+
+from typing import Dict, Set, List, Optional, Any, Union
+
 
 
 class MftRecord:
@@ -79,11 +81,11 @@ def parse_record(self) -> None:
             self.base_ref = struct.unpack("<Q", self.raw_record[MFT_RECORD_FILE_REFERENCE_OFFSET:MFT_RECORD_FILE_REFERENCE_OFFSET+MFT_RECORD_FILE_REFERENCE_SIZE])[0]
             self.next_attrid = struct.unpack("<H", self.raw_record[MFT_RECORD_NEXT_ATTRIBUTE_ID_OFFSET:MFT_RECORD_NEXT_ATTRIBUTE_ID_OFFSET+MFT_RECORD_NEXT_ATTRIBUTE_ID_SIZE])[0]
             self.recordnum = struct.unpack("<I", self.raw_record[MFT_RECORD_RECORD_NUMBER_OFFSET:MFT_RECORD_RECORD_NUMBER_OFFSET+MFT_RECORD_RECORD_NUMBER_SIZE])[0]
+            self.parse_attributes()
+
         except struct.error:
-            if self.debug:
+            if hasattr(self, 'debug') and self.debug:
                 print(f"Error parsing MFT record header for record {self.recordnum}")
-        
-        self.parse_attributes()
 
     def parse_attributes(self) -> None:
         offset = self.attr_off
@@ -104,7 +106,7 @@ def parse_attributes(self) -> None:
                 elif attr_type == ATTRIBUTE_LIST_ATTRIBUTE:
                     self.parse_attribute_list(offset)
                 elif attr_type == OBJECT_ID_ATTRIBUTE:
-                    self.parse_object_id(offset)
+                    self.parse_object_id_attribute(offset)
                 elif attr_type == SECURITY_DESCRIPTOR_ATTRIBUTE:
                     self.parse_security_descriptor(offset)
                 elif attr_type == VOLUME_NAME_ATTRIBUTE: