Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ghidra: Implement GhidraFeatureExtractor #1681

Merged
merged 35 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1235a79
restore from corrupted .git
colton-gabertan Jul 26, 2023
26cfd47
lint repo
colton-gabertan Jul 26, 2023
8b87b21
Merge branch 'backend-ghidra' into ghidra-insn-features
colton-gabertan Jul 26, 2023
ecc45ed
temp: remove lint failing rule
colton-gabertan Jul 26, 2023
e5af05f
implement dereferencing, clean up extractors
colton-gabertan Aug 2, 2023
be9bc76
implement proper dereferencing routines as applicable
colton-gabertan Aug 2, 2023
70ed9aa
fix nzxor implementation, remediate ghidra analysis issues
colton-gabertan Aug 2, 2023
1f2f02a
lint repo
colton-gabertan Aug 2, 2023
d3a4599
Assert typing, lint repo
colton-gabertan Aug 2, 2023
37041cb
avoid extracting pointers in bytes extraction
colton-gabertan Aug 2, 2023
cdb9c45
Merge branch 'backend-ghidra' into ghidra-insn-features
colton-gabertan Aug 2, 2023
948796f
attempt to recover submodule
colton-gabertan Aug 2, 2023
fc0eb4b
implement GhidraFeatureExtractor & ghidra_main()
colton-gabertan Aug 4, 2023
c29bada
lint repo
colton-gabertan Aug 4, 2023
270290f
document examples, clean-up & testing
colton-gabertan Aug 6, 2023
a3cf968
Merge branch 'ghidra-insn-features' into ghidra-extractor-main
colton-gabertan Aug 6, 2023
7e52201
lint repo
colton-gabertan Aug 6, 2023
d155c0c
Merge branch 'ghidra-insn-features' into ghidra-extractor-main
colton-gabertan Aug 6, 2023
01862e5
properly map import dict
colton-gabertan Aug 6, 2023
93b6d1b
properly map fake addresses
colton-gabertan Aug 6, 2023
b68adc2
fix fake addr mapping
colton-gabertan Aug 6, 2023
ffeb591
Merge branch 'ghidra-insn-features' into ghidra-extractor-main
colton-gabertan Aug 6, 2023
c61dc45
properly map externs
colton-gabertan Aug 8, 2023
a10a779
Merge branch 'ghidra-insn-features' into ghidra-extractor-main
colton-gabertan Aug 8, 2023
c968cfa
re-align consistency with other backends
colton-gabertan Aug 8, 2023
7cfbafa
Merge branch 'backend-ghidra' into ghidra-extractor-main
colton-gabertan Aug 8, 2023
c3e690e
lint repo
colton-gabertan Aug 8, 2023
4579a16
fix dereferencing routine
colton-gabertan Aug 8, 2023
f000ae8
clean up helpers
colton-gabertan Aug 8, 2023
e512d21
fix format string
colton-gabertan Aug 8, 2023
6c717f9
disable progress bar to exit gracefully
colton-gabertan Aug 8, 2023
1390a23
enable pbar in headless runtime mode
colton-gabertan Aug 10, 2023
126c4eb
Merge branch 'backend-ghidra' into ghidra-extractor-main
colton-gabertan Aug 10, 2023
0de638a
refactor repo for breaking Ghidrathon change
colton-gabertan Aug 15, 2023
6c6bbce
fix ghidra import issue
colton-gabertan Aug 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions capa/features/extractors/ghidra/basicblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.basicblock import BasicBlock
from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
from capa.features.extractors.base_extractor import BBHandle, FunctionHandle

listing = currentProgram.getListing() # type: ignore [name-defined] # noqa: F821

Expand Down Expand Up @@ -116,7 +117,7 @@ def extract_bb_tight_loop(bb: ghidra.program.model.block.CodeBlock) -> Iterator[
)


def extract_features(bb: ghidra.program.model.block.CodeBlock) -> Iterator[Tuple[Feature, Address]]:
def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""
extract features from the given basic block.

Expand All @@ -126,6 +127,7 @@ def extract_features(bb: ghidra.program.model.block.CodeBlock) -> Iterator[Tuple
yields:
Tuple[Feature, int]: the features and their location found in this basic block.
"""
bb = bbh.inner
yield BasicBlock(), AbsoluteVirtualAddress(bb.getMinAddress().getOffset())
for bb_handler in BASIC_BLOCK_HANDLERS:
for feature, addr in bb_handler(bb):
Expand All @@ -135,8 +137,10 @@ def extract_features(bb: ghidra.program.model.block.CodeBlock) -> Iterator[Tuple
def main():
features = []
for fhandle in capa.features.extractors.ghidra.helpers.get_function_symbols():
fh = FunctionHandle(address=fhandle.getBody().getMinAddress().getOffset(), inner=fhandle)
for bb in SimpleBlockIterator(BasicBlockModel(currentProgram), fhandle.getBody(), monitor): # type: ignore [name-defined] # noqa: F821
features.extend(list(extract_features(bb)))
bbh = BBHandle(address=bb.getMinAddress(), inner=bb)
features.extend(list(extract_features(fh, bbh)))

import pprint

Expand Down
44 changes: 38 additions & 6 deletions capa/features/extractors/ghidra/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Tuple

import ghidra
from typing import List, Tuple, Iterator

import capa.features.extractors.ghidra.file
import capa.features.extractors.ghidra.insn
import capa.features.extractors.ghidra.global_
import capa.features.extractors.ghidra.function
import capa.features.extractors.ghidra.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import FeatureExtractor

currentProgram: ghidra.program.database.ProgramDB
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor


class GhidraFeatureExtractor(FeatureExtractor):
Expand All @@ -33,3 +33,35 @@ def extract_global_features(self):

def extract_file_features(self):
yield from capa.features.extractors.ghidra.file.extract_features()

def get_functions(self) -> Iterator[FunctionHandle]:
    """
    yield a FunctionHandle for every function returned by the Ghidra helpers.

    each handle is addressed by the lowest address of the function body and
    carries the underlying Ghidra function object as `inner`.
    """
    import capa.features.extractors.ghidra.helpers as ghidra_helpers

    for func in ghidra_helpers.get_function_symbols():
        start = AbsoluteVirtualAddress(func.getBody().getMinAddress().getOffset())
        yield FunctionHandle(address=start, inner=func)

@staticmethod
def get_function(addr: int) -> FunctionHandle:
    """
    resolve the function containing `addr` and wrap it in a FunctionHandle.

    args:
      addr: integer offset of an address inside the function of interest.

    returns:
      FunctionHandle: addressed by the function's entry point, with the Ghidra function as `inner`.

    NOTE(review): assumes `addr` lies inside a function; the lookup result is used
    without a check, so an address outside any function fails on the attribute access below.
    """
    # build the address from the program rather than from currentAddress, which
    # presumably is unset when there is no cursor location (e.g. headless runs) - TODO confirm
    func = getFunctionContaining(toAddr(addr))  # type: ignore [name-defined] # noqa: F821
    # a function's start is exposed via getEntryPoint(), matching the function-level extractors
    return FunctionHandle(address=AbsoluteVirtualAddress(func.getEntryPoint().getOffset()), inner=func)

def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
    """yield each (feature, address) pair produced by the Ghidra function-level extractor for `fh`"""
    for feature, addr in capa.features.extractors.ghidra.function.extract_features(fh):
        yield feature, addr

def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
    """yield the basic block handles that the Ghidra helpers report for the function `fh`"""
    import capa.features.extractors.ghidra.helpers as ghidra_helpers

    for bbh in ghidra_helpers.get_function_blocks(fh):
        yield bbh

def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
    """yield each (feature, address) pair produced by the Ghidra basic block extractor for `bbh`"""
    for feature, addr in capa.features.extractors.ghidra.basicblock.extract_features(fh, bbh):
        yield feature, addr

def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
    """yield the instruction handles that the Ghidra helpers report for the basic block `bbh`"""
    import capa.features.extractors.ghidra.helpers as ghidra_helpers

    for ih in ghidra_helpers.get_insn_in_range(bbh):
        yield ih

def extract_insn_features(
    self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
    """
    yield the features that the Ghidra instruction extractor produces for `ih`.

    args:
      fh: handle of the function containing the instruction.
      bbh: handle of the basic block containing the instruction.
      ih: handle of the instruction to inspect.

    the return annotation mirrors the sibling extract_* methods of this class.
    """
    yield from capa.features.extractors.ghidra.insn.extract_features(fh, bbh, ih)
5 changes: 3 additions & 2 deletions capa/features/extractors/ghidra/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from capa.features.common import Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors import loops
from capa.features.extractors.base_extractor import FunctionHandle


def extract_function_calls_to(fh: ghidra.program.database.function.FunctionDB):
Expand Down Expand Up @@ -44,9 +45,9 @@ def extract_recursive_call(fh: ghidra.program.database.function.FunctionDB):
yield Characteristic("recursive call"), AbsoluteVirtualAddress(fh.getEntryPoint().getOffset())


def extract_features(fh: ghidra.program.database.function.FunctionDB) -> Iterator[Tuple[Feature, Address]]:
def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
for func_handler in FUNCTION_HANDLERS:
for feature, addr in func_handler(fh):
for feature, addr in func_handler(fh.inner):
colton-gabertan marked this conversation as resolved.
Show resolved Hide resolved
yield feature, addr


Expand Down
181 changes: 180 additions & 1 deletion capa/features/extractors/ghidra/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,17 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import Iterator
from typing import Any, Dict, List, Iterator

import ghidra
from ghidra.program.model.lang import OperandType
from ghidra.program.model.block import BasicBlockModel, SimpleBlockIterator
from ghidra.program.model.symbol import SourceType, SymbolType
from ghidra.program.model.address import AddressSpace

import capa.features.extractors.helpers
from capa.features.address import AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle


def fix_byte(b: int) -> bytes:
Expand Down Expand Up @@ -70,3 +78,174 @@ def get_function_symbols() -> Iterator[ghidra.program.database.function.Function
"""yield all non-external function symbols"""

yield from currentProgram.getFunctionManager().getFunctionsNoStubs(True) # type: ignore [name-defined] # noqa: F821


def get_function_blocks(fh: ghidra.program.database.function.FunctionDB) -> Iterator[BBHandle]:
    """
    yield a BBHandle for each basic block in the body of the given function.

    NOTE(review): the Ghidra function is read from `fh.inner`, so callers pass a
    handle wrapping the function, not the bare function the annotation names.
    """
    model = BasicBlockModel(currentProgram)  # type: ignore [name-defined] # noqa: F821
    body = fh.inner.getBody()
    for block in SimpleBlockIterator(model, body, monitor):  # type: ignore [name-defined] # noqa: F821
        start = AbsoluteVirtualAddress(block.getMinAddress().getOffset())
        yield BBHandle(address=start, inner=block)
colton-gabertan marked this conversation as resolved.
Show resolved Hide resolved


def get_insn_in_range(bbh: BBHandle) -> Iterator[InsnHandle]:
    """yield an InsnHandle for each address of the basic block at which an instruction is found"""
    for location in bbh.inner.getAddresses(True):
        instruction = getInstructionAt(location)  # type: ignore [name-defined] # noqa: F821
        if not instruction:
            # no instruction is reported at this address
            continue
        insn_addr = AbsoluteVirtualAddress(instruction.getAddress().getOffset())
        yield InsnHandle(address=insn_addr, inner=instruction)
colton-gabertan marked this conversation as resolved.
Show resolved Hide resolved


def get_file_imports() -> Dict[int, Any]:
    """
    get all import names & addrs.

    walks every external function of the program. each data reference to the
    function's symbol contributes its source address (the pointer to the fake
    external addr) as a key.

    returns:
      Dict[int, Any]: pointer offset -> import name. the mapping holds a single name per
        address, so of the variants produced by generate_symbols() the last one is kept.
    """
    imports: Dict[int, Any] = {}

    for f in currentProgram.getFunctionManager().getExternalFunctions():  # type: ignore [name-defined] # noqa: F821
        # gather the pointers for *this* function only, so an address can never be
        # missing or carried over from a previously visited function
        ptrs = [
            r.getFromAddress().getOffset()  # gets pointer to fake external addr
            for r in f.getSymbol().getReferences()
            if r.getReferenceType().isData()
        ]
        if not ptrs:
            continue

        fstr = f.toString().split("::")  # format: MODULE.dll::import / MODULE::Ordinal_*
        if "Ordinal_" in fstr[1]:
            fstr[1] = f"#{fstr[1].split('_')[1]}"

        # [:-4] drops the last four characters of the module part (the ".dll" of MODULE.dll)
        for name in capa.features.extractors.helpers.generate_symbols(fstr[0][:-4], fstr[1]):
            for ptr in ptrs:
                imports[ptr] = name

    return imports


def get_file_externs() -> Dict[int, Any]:
    """
    map the address of each analysis-identified global function symbol to its name.

    returns:
      Dict[int, Any]: symbol address offset -> name, with a leading "FID_conflict:" removed.
    """
    externs: Dict[int, Any] = {}

    for sym in currentProgram.getSymbolTable().getAllSymbols(True):  # type: ignore [name-defined] # noqa: F821
        # .isExternal() misses more than this config for the function symbols
        if not (
            sym.getSymbolType() == SymbolType.FUNCTION
            and sym.getSource() == SourceType.ANALYSIS
            and sym.isGlobal()
        ):
            continue

        name = sym.getName()  # starts to resolve names based on Ghidra's FidDB
        if name.startswith("FID_conflict:"):  # format: FID_conflict:<function-name>
            name = name[len("FID_conflict:") :]

        # store the pair directly: collecting addresses and names in separate lists and
        # adding an extra un-prefixed alias (`_fwrite` -> `fwrite`) to the names only
        # shifted every later name onto the wrong address.
        # NOTE(review): one value per address cannot hold both spellings; keeping the
        # alias as well needs a multi-valued mapping and matching caller changes.
        externs[sym.getAddress().getOffset()] = name

    return externs


def map_fake_import_addrs() -> Dict[int, int]:
    """
    map the entry point of each external function to the address of a data reference to it.

    returns:
      Dict[int, int]: external function entry offset -> offset of the referencing address.
        functions without a data reference are left out; when a function has several,
        the last one seen is kept.
    """
    fake_to_real: Dict[int, int] = {}

    for f in currentProgram.getFunctionManager().getExternalFunctions():  # type: ignore [name-defined] # noqa: F821
        fake_addr = f.getEntryPoint().getOffset()
        for r in f.getSymbol().getReferences():
            if r.getReferenceType().isData():
                # pair each reference with its own function: zipping two independently
                # grown lists misaligns as soon as a function has zero or several data references
                fake_to_real[fake_addr] = r.getFromAddress().getOffset()

    return fake_to_real


def get_external_locs() -> List[int]:
    """
    collect the external location address of every external function that has one.

    NOTE(review): the entries are whatever getAddress() returns, stored as-is; they
    are not converted to integer offsets despite the List[int] annotation.
    """
    external_functions = currentProgram.getFunctionManager().getExternalFunctions()  # type: ignore [name-defined] # noqa: F821
    candidates = (func.getExternalLocation().getAddress() for func in external_functions)
    # falsy results (no address available) are dropped
    return [loc for loc in candidates if loc]


def check_addr_for_api(
    addr: ghidra.program.model.address.Address,
    fakes: Dict[int, int],
    imports: Dict[int, int],
    externs: Dict[int, int],
    ex_locs: List[int],
) -> bool:
    """
    report whether `addr` is known to any of the API lookup tables.

    the offset of `addr` is looked up in `fakes`, `imports` and `externs`, in that
    order; a hit only counts when the stored value is truthy. failing that, `addr`
    itself (not its offset) is tested for membership in `ex_locs`.
    """
    offset = addr.getOffset()

    if any(table.get(offset) for table in (fakes, imports, externs)):
        return True

    return addr in ex_locs


def is_call_or_jmp(insn: ghidra.program.database.code.InstructionDB) -> bool:
    """report whether the mnemonic contains "CALL" or "J" (JMP, JNE, JNZ, etc)"""
    mnemonic = insn.getMnemonicString()
    # substring tests: any mnemonic containing either marker counts
    return "CALL" in mnemonic or "J" in mnemonic


def is_sp_modified(insn: ghidra.program.database.code.InstructionDB) -> bool:
    """
    report whether the instruction writes to a register whose name contains "SP".

    every operand is inspected: deciding on the first register operand alone
    misses a stack pointer that is written through a later operand.

    returns:
      bool: True if any REGISTER operand names an "SP" register and its operand
        reference type is a write, False otherwise (including no register operands).
    """
    return any(
        insn.getOperandType(i) == OperandType.REGISTER
        and "SP" in insn.getRegister(i).getName()
        and insn.getOperandRefType(i).isWrite()
        for i in range(insn.getNumOperands())
    )


def is_xor_on_stack(insn: ghidra.program.database.code.InstructionDB) -> bool:
    """report whether any REGISTER operand names a register containing "SP" or "BP\""""
    for idx in range(insn.getNumOperands()):
        if insn.getOperandType(idx) != OperandType.REGISTER:
            continue
        reg_name = insn.getRegister(idx).getName()
        if "SP" in reg_name or "BP" in reg_name:
            return True
    return False


def is_stack_referenced(insn: ghidra.program.database.code.InstructionDB) -> bool:
    """report whether any reference originating at the instruction is a stack reference"""
    # does not work for non-branching insn
    for ref in insn.getReferencesFrom():
        if ref.isStackReference():
            return True
    return False


def is_zxor(insn: ghidra.program.database.code.InstructionDB) -> bool:
    """
    report whether all operand objects of the instruction are equal.

    assumes an XOR insn: XOR's against the same operand zero out.
    an instruction without operand objects is reported as True.
    """
    # getOpObjects() returns a sequence per operand; flatten them into one list
    flat = [obj for i in range(insn.getNumOperands()) for obj in insn.getOpObjects(i)]
    if not flat:
        return True

    first = flat[0]
    return all(obj == first for obj in flat)


def dereference_ptr(insn: ghidra.program.database.code.InstructionDB):
    """
    follow the pointer referenced by the instruction, when it can be resolved.

    returns the value of the defined pointer data containing the instruction's
    address, unless that value lives in a RAM-type address space; in every other
    case the instruction's own address (`insn.getAddress(0)`) is returned.
    """
    to_deref = insn.getAddress(0)
    dat = getDataContaining(to_deref)  # type: ignore [name-defined] # noqa: F821

    # nothing to follow: no data here, or the data is not a defined pointer
    if not dat or not (dat.isDefined() and dat.isPointer()):
        return to_deref

    target = dat.getValue()
    # now we need to check the addr space to see if it is truly resolvable
    # ghidra sometimes likes to hand us direct RAM addrs, which typically point
    # to api calls that we can't actually resolve as such
    if target.getAddressSpace().getType() == AddressSpace.TYPE_RAM:
        return to_deref
    return target
Loading
Loading