Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions sms/mdc_repacker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import xml.etree.ElementTree as ET
import argparse

def load_xml_mapping(xml_file):
    """
    Read the XML written by mdc_unpacker.py and return a replacement table.

    Keys are the 'hex' attribute of each <JapaneseText> element (despite the
    attribute name, it holds the marker as a decimal string); values are the
    element's text encoded as UTF-8 bytes. <Entry> elements without a
    <JapaneseText> child are skipped, and an element with no text maps to
    empty bytes.
    """
    replacements = {}
    for entry in ET.parse(xml_file).getroot().findall("Entry"):
        text_node = entry.find("JapaneseText")
        if text_node is None:
            continue
        marker_key = text_node.get("hex")
        # The element text is the string to inject; absent text becomes "".
        body = text_node.text if text_node.text else ""
        replacements[marker_key] = body.encode("utf-8")
    return replacements

def rebuild_mdc(original_mdc, xml_file, output_mdc):
    """
    Rewrite an MDC file, swapping in replacement text from an XML mapping.

    The source file is read as a sequence of records, each an 8-byte marker
    followed by null-terminated text. For every record whose marker (as a
    decimal string) is a key in the mapping loaded from xml_file, the text is
    replaced; otherwise the original bytes are kept. Every record is written
    back as marker + text + a null terminator.
    """
    replacements = load_xml_mapping(xml_file)

    with open(original_mdc, "rb") as src:
        blob = src.read()

    size = len(blob)
    rebuilt = []
    cursor = 0
    # A record needs a full 8-byte marker; a shorter tail ends the scan.
    while cursor + 8 <= size:
        marker = blob[cursor:cursor + 8]
        # Same decimal key that mdc_unpacker.py derives from the marker bytes.
        key = str(int(marker.hex(), 16))
        text_start = cursor + 8

        terminator = blob.find(b'\x00', text_start)
        if terminator < 0:
            # No terminator left: the text runs to the end of the file.
            terminator = size
        cursor = terminator + 1  # continue after the null terminator

        payload = replacements.get(key, blob[text_start:terminator])
        rebuilt.append(marker + payload + b'\x00')

    with open(output_mdc, "wb") as dst:
        dst.write(b"".join(rebuilt))

    print(f"Rebuilt MDC file with {len(rebuilt)} entries and saved to {output_mdc}")

if __name__ == "__main__":
    # Command-line entry point: all three paths are mandatory options.
    cli = argparse.ArgumentParser(
        description="Rebuild an MDC file with English text injected from an XML file."
    )
    option_specs = (
        ("-s", "--source", "Path to the original MDC file (source)"),
        ("-i", "--input", "Path to the input XML file containing English text mappings"),
        ("-o", "--output", "Path to the output rebuilt MDC file"),
    )
    for short_flag, long_flag, help_text in option_specs:
        cli.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    options = cli.parse_args()

    rebuild_mdc(options.source, options.input, options.output)
86 changes: 86 additions & 0 deletions sms/mdc_translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Usage: python mdc_translator.py --input=output/maildic_m.xml --output=results/maildic_m.xml
# Tested on an RTX 4090; it can also run on an RTX 3090, but you have to swap madlad400-3b-mt for a less demanding model.

import argparse
import os
import xml.etree.ElementTree as ET
from transformers import T5ForConditionalGeneration, T5Tokenizer

def load_model_and_tokenizer(model_name):
    """
    Load the pretrained T5 model and its tokenizer for model_name.

    The model is loaded with device_map="auto". Returns a
    (model, tokenizer) tuple.
    """
    # The model is loaded first, then the tokenizer, as a tuple in that order.
    return (
        T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto"),
        T5Tokenizer.from_pretrained(model_name),
    )

def translate_text(text, model, tokenizer):
    """
    Translate text to English while keeping selected symbols intact.

    Each protected symbol is swapped for a placeholder before translation and
    swapped back in the decoded output. The prompt is the masked text
    prefixed with "<2en> "; generation uses max_length=256 and num_beams=1.
    Returns the decoded translation with the symbols restored.
    """
    # (symbol, placeholder) pairs; extend this tuple to protect more symbols.
    protected = (
        ("▲主人公*▲", "<|HERO|>"),
        ("▼", "<|DOWNARROW|>"),
    )

    masked = text
    for symbol, placeholder in protected:
        masked = masked.replace(symbol, placeholder)

    # "<2en>" is the target-language prefix prepended to the prompt.
    prompt = f"<2en> {masked}"

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded.input_ids.to(model.device)
    generated = model.generate(input_ids=input_ids, max_length=256, num_beams=1)
    result = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Put the original symbols back in place of their placeholders.
    for symbol, placeholder in protected:
        result = result.replace(placeholder, symbol)
    return result

def process_xml(input_file, output_file, model, tokenizer):
    """
    Translate every <JapaneseText> element of an XML file and save the result.

    Parameters:
        input_file:  path to the XML file to read.
        output_file: destination. An existing directory, or a path ending in
                     a path separator, receives a file named "translated.xml".
                     Any missing parent directory is created. A falsy value
                     prints the XML to stdout instead of writing a file.
        model, tokenizer: passed through to translate_text().

    The text of each non-blank <JapaneseText> element is stripped, translated
    and written back into the same element; blank elements are left alone.
    """
    # Parse the XML file.
    tree = ET.parse(input_file)
    root = tree.getroot()
    entries = root.findall('Entry')
    total = len(entries)
    print(f"Found {total} entries in the XML file.")

    # Resolve and prepare the destination BEFORE translating: a missing
    # output directory would otherwise only surface in tree.write(), after
    # all entries had been translated, and the whole run would be lost.
    if output_file:
        output_file = os.fspath(output_file)
        if os.path.isdir(output_file) or output_file.endswith(("/", os.sep)):
            output_file = os.path.join(output_file, "translated.xml")
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Process each <Entry> and update the <JapaneseText> element with the translation.
    for i, entry in enumerate(entries, start=1):
        japanese_elem = entry.find('JapaneseText')
        if japanese_elem is not None and japanese_elem.text and japanese_elem.text.strip():
            original_text = japanese_elem.text.strip()
            japanese_elem.text = translate_text(original_text, model, tokenizer)
        print(f"Processed entry {i}/{total}")

    # Write out the modified XML tree.
    if output_file:
        tree.write(output_file, encoding="utf-8", xml_declaration=True)
        print(f"Translated XML written to {output_file}")
    else:
        xml_str = ET.tostring(root, encoding="utf-8", method="xml").decode("utf-8")
        print(xml_str)

if __name__ == "__main__":
    # Command-line entry point: parse the two required paths, load the
    # translation model, then translate the XML file.
    cli = argparse.ArgumentParser(
        description="Japanese to English XML Translator that preserves specific symbols"
    )
    cli.add_argument("--input", "-i", type=str, required=True,
                     help="Path to the input XML file")
    cli.add_argument("--output", "-o", type=str, required=True,
                     help="Path to the output XML file or directory")
    options = cli.parse_args()

    translator, vocab = load_model_and_tokenizer('jbochi/madlad400-3b-mt')

    process_xml(options.input, options.output, translator, vocab)
66 changes: 66 additions & 0 deletions sms/mdc_unpacker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Usage: python mdc_unpacker.py to_convert/maildic_r.mdc output/maildic_r.xml
# Tested on python-3.11.7 64bit

import xml.etree.ElementTree as ET
import argparse

def convert_hex_marker(marker):
    """ Return the decimal-string form of a hexadecimal marker string """
    value = int(marker, 16)
    return f"{value}"

def replace_invalid_chars(text):
    """
    Replace every character that XML 1.0 does not allow with '?'.

    Allowed characters follow the XML 1.0 `Char` production: tab, line feed,
    carriage return, U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
    Besides the C0 control characters this also rejects lone surrogates and
    the non-characters U+FFFE / U+FFFF, which an XML parser may refuse when
    the generated file is read back.
    """
    def is_xml_char(code):
        return (
            code in (0x09, 0x0A, 0x0D)
            or 0x20 <= code <= 0xD7FF
            or 0xE000 <= code <= 0xFFFD
            or 0x10000 <= code <= 0x10FFFF
        )

    return ''.join(c if is_xml_char(ord(c)) else '?' for c in text)

def extract_dictionary_entries(file_path, output_xml="output.xml"):
    """
    Extract (marker, text) records from a binary file and save them as XML.

    The file is read as consecutive records: an 8-byte marker followed by
    null-terminated text. Each record becomes
    <Entry><JapaneseText hex="MARKER">TEXT</JapaneseText></Entry> under a
    <Dictionary> root, where MARKER is the marker converted to a decimal
    string by convert_hex_marker().

    Parameters:
        file_path:  path to the binary input file.
        output_xml: path of the XML file to write.

    Parsing stops at the first position with fewer than 8 bytes left, or at
    a record whose text has no null terminator; such trailing data is not
    emitted.
    """
    with open(file_path, "rb") as f:
        file_data = f.read()

    entries = []
    index = 0
    # A record needs a complete 8-byte marker.
    while index + 8 <= len(file_data):
        marker = file_data[index:index + 8].hex().upper()
        friendly_marker = convert_hex_marker(marker)
        index += 8

        # Find the end of the text (assumed to be null-terminated UTF-8).
        end_index = file_data.find(b'\x00', index)
        if end_index == -1:
            break

        # errors="ignore" silently drops undecodable bytes, so decode() cannot
        # raise UnicodeDecodeError here and no error handler is needed.
        text = file_data[index:end_index].decode("utf-8", errors="ignore")
        text = replace_invalid_chars(text)  # keep the XML well-formed

        entries.append((friendly_marker, text))

        # Move index past the null terminator.
        index = end_index + 1

    # Create the XML structure.
    root = ET.Element("Dictionary")
    for marker, text in entries:
        entry = ET.SubElement(root, "Entry")
        japanese_text = ET.SubElement(entry, "JapaneseText", hex=marker)
        japanese_text.text = text

    # Serialize and save.
    tree = ET.ElementTree(root)
    tree.write(output_xml, encoding="utf-8", xml_declaration=True)
    print(f"Extracted {len(entries)} entries and saved to {output_xml}")

if __name__ == "__main__":
    # Command-line entry point: two positional paths, input then output.
    cli = argparse.ArgumentParser(description="Extract dictionary entries from a binary file and output to an XML file.")
    positional_specs = (
        ("input_file", "Path to the input binary file"),
        ("output_file", "Path to the output XML file"),
    )
    for arg_name, arg_help in positional_specs:
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    extract_dictionary_entries(options.input_file, options.output_file)
Loading