kiwiz · zaccken · Mar 21, 2025
diff --git a/sms/mdc_repacker.py b/sms/mdc_repacker.py
@@ -0,0 +1,79 @@
+import xml.etree.ElementTree as ET
+import argparse
+
+def load_xml_mapping(xml_file):
+    """
+    Read the XML produced by mdc_unpacker.py and return a dict keyed by the
+    'hex' attribute of each <JapaneseText> element (the friendly decimal
+    marker), whose values are the element text encoded as UTF-8 bytes.
+
+    <Entry> elements without a <JapaneseText> child are skipped; an element
+    with no text maps to empty bytes.
+    """
+    document = ET.parse(xml_file).getroot()
+    # One candidate node per <Entry>; None where the child is absent.
+    text_nodes = (item.find("JapaneseText") for item in document.findall("Entry"))
+    return {
+        node.get("hex"): (node.text or "").encode("utf-8")
+        for node in text_nodes
+        if node is not None
+    }
+
+def rebuild_mdc(original_mdc, xml_file, output_mdc):
+    """
+    Rebuild an MDC file, injecting replacement text from an XML mapping.
+
+    The source file is read as a sequence of records, each made of an 8-byte
+    marker followed by text that runs up to a null byte. For every record
+    whose friendly marker appears in the XML mapping the text is replaced;
+    all other records keep their original text.
+
+    Bytes that do not form a complete record are copied through unchanged,
+    and a final record that had no null terminator in the source is written
+    without one, so a file with no replacements is reproduced byte-for-byte.
+
+    Parameters:
+        original_mdc: path to the source MDC file.
+        xml_file: path to the XML file read by load_xml_mapping().
+        output_mdc: path the rebuilt MDC file is written to.
+    """
+    mapping = load_xml_mapping(xml_file)
+
+    with open(original_mdc, "rb") as f:
+        data = f.read()
+
+    chunks = []
+    entry_count = 0
+    index = 0
+    # A record needs at least a full 8-byte marker.
+    while index + 8 <= len(data):
+        marker_bytes = data[index:index + 8]
+        # Big-endian integer rendered in decimal: the same value as
+        # int(marker_bytes.hex(), 16), i.e. the friendly marker string.
+        friendly_marker = str(int.from_bytes(marker_bytes, "big"))
+        index += 8
+
+        # The text runs up to the next null byte, or to end of file.
+        text_end = data.find(b'\x00', index)
+        terminated = text_end != -1
+        if not terminated:
+            text_end = len(data)
+        original_text = data[index:text_end]
+        # Skip the terminator only when there actually was one.
+        index = text_end + 1 if terminated else text_end
+
+        # Use the replacement when one exists, otherwise keep the original.
+        new_text = mapping.get(friendly_marker, original_text)
+
+        # Mirror the source's termination instead of inventing a null byte.
+        chunks.append(marker_bytes + new_text + (b'\x00' if terminated else b''))
+        entry_count += 1
+
+    # Fewer than 8 bytes left: not a record, but still part of the file.
+    if index < len(data):
+        chunks.append(data[index:])
+
+    # Reassemble the file
+    with open(output_mdc, "wb") as f:
+        f.write(b"".join(chunks))
+
+    print(f"Rebuilt MDC file with {entry_count} entries and saved to {output_mdc}")
+
+if __name__ == '__main__':
+    # Command-line entry point: three mandatory path options.
+    cli = argparse.ArgumentParser(
+        description="Rebuild an MDC file with English text injected from an XML file."
+    )
+    option_specs = (
+        ("-s", "--source", "Path to the original MDC file (source)"),
+        ("-i", "--input", "Path to the input XML file containing English text mappings"),
+        ("-o", "--output", "Path to the output rebuilt MDC file"),
+    )
+    for short_flag, long_flag, help_text in option_specs:
+        cli.add_argument(short_flag, long_flag, type=str, required=True,
+                         help=help_text)
+    options = cli.parse_args()
+
+    rebuild_mdc(options.source, options.input, options.output)
diff --git a/sms/mdc_translator.py b/sms/mdc_translator.py
@@ -0,0 +1,86 @@
+# python mdc_translator.py --input=output/maildic_m.xml --output=results/maildic_m.xml
+# Tested on an RTX 4090; it can also run on an RTX 3090, but you have to swap madlad400-3b-mt for a less demanding model.
+
+import argparse
+import os
+import xml.etree.ElementTree as ET
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+def load_model_and_tokenizer(model_name):
+    """Load the T5 model (with device_map="auto") and its tokenizer; return (model, tokenizer)."""
+    # Tuple items are evaluated left to right, so the model is loaded first.
+    return (
+        T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto"),
+        T5Tokenizer.from_pretrained(model_name),
+    )
+
+def translate_text(text, model, tokenizer, *, max_length=256, num_beams=1, preserve=None):
+    """
+    Translate one string to English while protecting selected symbols.
+
+    Each symbol in the preserve table is swapped for its placeholder before
+    the text is sent to the model, and every placeholder found in the model
+    output is swapped back afterwards.
+
+    Parameters:
+        text: the source text to translate.
+        model: object providing `.device` and `.generate(...)`.
+        tokenizer: callable tokenizer that also provides `.decode(...)`.
+        max_length: passed to `model.generate` (default 256).
+        num_beams: passed to `model.generate` (default 1).
+        preserve: optional dict mapping symbol -> placeholder; replacements
+            are applied in the dict's iteration order. Defaults to the
+            built-in table below.
+
+    Returns:
+        The decoded translation with the original symbols restored.
+    """
+    if preserve is None:
+        # Default symbols/phrases to preserve.
+        preserve = {
+            "▲主人公＊▲": "<|HERO|>",
+            "▼": "<|DOWNARROW|>",
+            # Add other symbols/phrases here if needed.
+        }
+
+    # Replace each symbol with its unique placeholder.
+    text_for_translation = text
+    for symbol, placeholder in preserve.items():
+        text_for_translation = text_for_translation.replace(symbol, placeholder)
+
+    # Prepend the "<2en>" target-language prefix.
+    text_to_translate = "<2en> " + text_for_translation
+
+    # Tokenize and generate the translation.
+    input_ids = tokenizer(text_to_translate, return_tensors="pt").input_ids.to(model.device)
+    outputs = model.generate(input_ids=input_ids, max_length=max_length, num_beams=num_beams)
+    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Restore the original symbols from their placeholders.
+    final_translation = translation
+    for symbol, placeholder in preserve.items():
+        final_translation = final_translation.replace(placeholder, symbol)
+
+    return final_translation
+
+def process_xml(input_file, output_file, model, tokenizer):
+    """
+    Translate the text of every <JapaneseText> element in an XML file.
+
+    Each <Entry> whose <JapaneseText> child holds non-blank text has that
+    text (stripped) passed to translate_text() and replaced by the result.
+
+    Parameters:
+        input_file: path to the XML file to read.
+        output_file: path of the XML file to write, or a directory (the
+            file is then named "translated.xml" inside it). Missing parent
+            directories are created. If falsy, the XML is printed instead.
+        model, tokenizer: forwarded unchanged to translate_text().
+    """
+    # Parse the XML file.
+    tree = ET.parse(input_file)
+    root = tree.getroot()
+    entries = root.findall('Entry')
+    total = len(entries)
+    print(f"Found {total} entries in the XML file.")
+
+    # Prepare the output location BEFORE the slow translation loop, so a
+    # missing directory cannot throw the finished work away at write time.
+    if output_file:
+        parent_dir = os.path.dirname(output_file)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        if os.path.isdir(output_file):
+            output_file = os.path.join(output_file, "translated.xml")
+
+    # Process each <Entry> and update the <JapaneseText> element with the translation.
+    for i, entry in enumerate(entries, start=1):
+        japanese_elem = entry.find('JapaneseText')
+        if japanese_elem is not None and japanese_elem.text and japanese_elem.text.strip():
+            original_text = japanese_elem.text.strip()
+            translated = translate_text(original_text, model, tokenizer)
+            japanese_elem.text = translated
+            print(f"Processed entry {i}/{total}")
+
+    # Write out the modified XML tree.
+    if output_file:
+        tree.write(output_file, encoding="utf-8", xml_declaration=True)
+        print(f"Translated XML written to {output_file}")
+    else:
+        xml_str = ET.tostring(root, encoding="utf-8", method="xml").decode("utf-8")
+        print(xml_str)
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the two paths, load the model, translate.
+    cli = argparse.ArgumentParser(
+        description="Japanese to English XML Translator that preserves specific symbols"
+    )
+    option_specs = (
+        ("--input", "-i", "Path to the input XML file"),
+        ("--output", "-o", "Path to the output XML file or directory"),
+    )
+    for long_flag, short_flag, help_text in option_specs:
+        cli.add_argument(long_flag, short_flag, type=str, required=True, help=help_text)
+    options = cli.parse_args()
+
+    model, tokenizer = load_model_and_tokenizer('jbochi/madlad400-3b-mt')
+
+    process_xml(options.input, options.output, model, tokenizer)
diff --git a/sms/mdc_unpacker.py b/sms/mdc_unpacker.py
@@ -0,0 +1,66 @@
+# Usage: python mdc_unpacker.py to_convert/maildic_r.mdc output/maildic_r.xml
+# Tested on python-3.11.7 64bit
+
+import xml.etree.ElementTree as ET
+import argparse
+
+def convert_hex_marker(marker):
+    """ Return the hexadecimal marker string rewritten as a decimal string """
+    numeric_value = int(marker, base=16)
+    return f"{numeric_value}"
+
+def replace_invalid_chars(text):
+    """
+    Replace every character that XML 1.0 does not allow with '?'.
+
+    Allowed characters follow the XML 1.0 `Char` production: tab, line feed,
+    carriage return, U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
+    Besides the C0 control characters this also covers surrogates and the
+    non-characters U+FFFE / U+FFFF, which a conforming parser rejects.
+    """
+    def is_xml_char(ch):
+        code = ord(ch)
+        return (
+            ch in '\t\n\r'
+            or 0x20 <= code <= 0xD7FF
+            or 0xE000 <= code <= 0xFFFD
+            or code >= 0x10000
+        )
+
+    return ''.join(c if is_xml_char(c) else '?' for c in text)
+
+def extract_dictionary_entries(file_path, output_xml="output.xml"):
+    """
+    Extract marker/text entries from a binary file into an XML file.
+
+    The input is read as a sequence of records: an 8-byte marker followed by
+    text ending at a null byte. Each record becomes
+    <Entry><JapaneseText hex="MARKER">text</JapaneseText></Entry> under a
+    <Dictionary> root, where MARKER is the marker converted to decimal by
+    convert_hex_marker().
+
+    Text is decoded as UTF-8 with undecodable bytes dropped, then passed
+    through replace_invalid_chars(). Parsing stops at the first position
+    that does not hold a complete record (fewer than 8 bytes left, or no
+    null terminator); a warning reports how many bytes were left unread.
+
+    Parameters:
+        file_path: path to the input binary file.
+        output_xml: path of the XML file to write.
+    """
+    with open(file_path, "rb") as f:
+        file_data = f.read()
+
+    root = ET.Element("Dictionary")
+    entry_count = 0
+    consumed = 0  # offset just past the last complete record
+
+    # A record needs at least a full 8-byte marker.
+    while consumed + 8 <= len(file_data):
+        text_start = consumed + 8
+
+        # Find the end of the text (null-terminated)
+        end_index = file_data.find(b'\x00', text_start)
+        if end_index == -1:
+            break
+
+        marker = file_data[consumed:text_start].hex().upper()
+        friendly_marker = convert_hex_marker(marker)
+
+        # errors="ignore" never raises; undecodable bytes are dropped.
+        text = file_data[text_start:end_index].decode("utf-8", errors="ignore")
+        text = replace_invalid_chars(text)  # Replace invalid characters
+
+        entry = ET.SubElement(root, "Entry")
+        japanese_text = ET.SubElement(entry, "JapaneseText", hex=friendly_marker)
+        japanese_text.text = text
+        entry_count += 1
+
+        # Move past the null terminator
+        consumed = end_index + 1
+
+    if consumed < len(file_data):
+        # Surface data that was not exported instead of dropping it silently.
+        leftover = len(file_data) - consumed
+        print(f"Warning: ignored {leftover} trailing bytes that do not form a complete entry")
+
+    # Convert to XML and save
+    tree = ET.ElementTree(root)
+    tree.write(output_xml, encoding="utf-8", xml_declaration=True)
+    print(f"Extracted {entry_count} entries and saved to {output_xml}")
+
+if __name__ == "__main__":
+    # Command-line entry point: two positional paths, input then output.
+    cli = argparse.ArgumentParser(description="Extract dictionary entries from a binary file and output to an XML file.")
+    positional_specs = (
+        ("input_file", "Path to the input binary file"),
+        ("output_file", "Path to the output XML file"),
+    )
+    for arg_name, help_text in positional_specs:
+        cli.add_argument(arg_name, help=help_text)
+    options = cli.parse_args()
+
+    extract_dictionary_entries(options.input_file, options.output_file)