-
Notifications
You must be signed in to change notification settings - Fork 482
Implement Custom UTF-8 Decoder #885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
cfeb127
e083376
4a54532
775f1ce
18e6080
6d8e314
8515899
51525ae
1f5f3eb
3d1093a
a5e46ae
3105843
7481274
60b3ca6
272770d
770955c
a354b30
960f2c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. | ||
import sys | ||
import logging | ||
import pathlib | ||
import argparse | ||
from typing import List, Tuple, Iterable, Optional | ||
|
||
import pefile | ||
|
||
MIN_STR_LEN = 4 | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def get_rdata_section(pe: pefile.PE) -> pefile.SectionStructure:
    """
    Return the first section of `pe` whose name is `.rdata`.

    The name is matched against the raw, NUL-padded `Name` field, so the
    prefix includes the terminating NUL byte.

    Raises:
      ValueError: if no section name starts with `.rdata\\x00`.
    """
    rdata = next((candidate for candidate in pe.sections if candidate.Name.startswith(b".rdata\x00")), None)
    if rdata is None:
        raise ValueError("no .rdata section found")

    return rdata
|
||
|
||
def extract_utf8_strings(pe: pefile.PE, min_length=MIN_STR_LEN) -> List[Tuple[str, int, int]]:
    """
    Extract printable UTF-8 strings from the .rdata section of a PE file.

    Args:
      pe: parsed PE file to search.
      min_length: minimum number of characters (not bytes) a string must
        contain to be reported.

    Returns:
      A list of `[string, start, end]` entries. `start` and `end` are the
      offsets, relative to the beginning of the .rdata section data, of the
      first and the last byte of the encoded string (both inclusive).
      NOTE: every entry is a three-item list, which is what this function has
      always produced at runtime, even though the annotation says `Tuple`.
      An empty list is returned (and an error is logged) when the file has
      no .rdata section.
    """
    try:
        rdata_section = get_rdata_section(pe)
    except ValueError as e:
        logger.error("cannot extract rust strings: %s", e)
        return []

    return _extract_utf8_strings_from_buffer(rdata_section.get_data(), min_length)


def _utf8_sequence_length(lead: int) -> int:
    """Return the sequence length announced by a UTF-8 lead byte, or 0 if `lead` cannot start a sequence."""
    # Reference: https://en.wikipedia.org/wiki/UTF-8
    if lead & 0x80 == 0x00:  # 0xxxxxxx
        return 1
    if lead & 0xE0 == 0xC0:  # 110xxxxx
        return 2
    if lead & 0xF0 == 0xE0:  # 1110xxxx
        return 3
    if lead & 0xF8 == 0xF0:  # 11110xxx
        return 4
    # continuation byte (10xxxxxx) or 0xF8..0xFF: never valid as a first byte
    return 0


def _extract_utf8_strings_from_buffer(buf: bytes, min_length: int) -> List[list]:
    """
    Scan `buf` for runs of printable UTF-8 characters.

    The buffer is walked once from left to right. At each position the lead
    byte announces how many bytes the character occupies; those bytes are then
    decoded strictly. A run of consecutive printable characters forms one
    string. A run ends at the first non-printable character, or at the first
    byte that is not part of a well-formed UTF-8 sequence (stray continuation
    byte, malformed or overlong sequence, or a sequence cut off by the end of
    the buffer).

    Returns `[string, start, end]` lists with inclusive byte offsets into
    `buf`, keeping only strings of at least `min_length` characters.
    """
    found = []
    run = []  # characters of the string currently being assembled
    run_start = 0  # offset of the first byte of the run
    run_end = 0  # offset of the last byte of the run
    offset = 0
    size = len(buf)

    while offset < size:
        lead = buf[offset]
        width = _utf8_sequence_length(lead)

        character = None
        if width == 1:
            # ASCII: the byte value is the code point
            character = chr(lead)
        elif width and offset + width <= size:
            try:
                # strict decoding, so that malformed data ends the run instead
                # of being silently dropped or partially decoded
                character = buf[offset : offset + width].decode("utf-8")
            except UnicodeDecodeError:
                character = None

        if character is not None and character.isprintable():
            if not run:
                run_start = offset
            run.append(character)
            run_end = offset + width - 1
            offset += width
            continue

        # anything else terminates the current run
        if run and len(run) >= min_length:
            found.append(["".join(run), run_start, run_end])
        run = []

        # skip a valid but non-printable character as a whole; after invalid
        # data advance a single byte and try to resynchronize there
        offset += width if character is not None else 1

    if run and len(run) >= min_length:
        found.append(["".join(run), run_start, run_end])

    return found
|
||
|
||
def main(argv=None):
    """
    Command-line entry point: print every string that `extract_utf8_strings`
    finds in the given PE file, one per line.

    Args:
      argv: argument list to parse; `None` lets argparse read `sys.argv`.
    """
    cli = argparse.ArgumentParser(description="Get Rust strings")
    cli.add_argument("path", help="file or path to analyze")
    cli.add_argument(
        "-n",
        "--minimum-length",
        dest="min_length",
        type=int,
        default=MIN_STR_LEN,
        help="minimum string length",
    )
    options = cli.parse_args(args=argv)

    logging.basicConfig(level=logging.DEBUG)

    # load the whole file into memory and let pefile parse it from the buffer
    image = pefile.PE(data=pathlib.Path(options.path).read_bytes(), fast_load=True)

    for entry in extract_utf8_strings(image, options.min_length):
        print(entry[0])
|
||
|
||
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Uh oh!
There was an error while loading. Please reload this page.