Skip to content

Wrapping up

The two snippets presented in the previous pages can be concatenated in a single extract you can run on your binaries to get a basic feature extractor.

#  Copyright 2022-2023 Quarkslab
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Feature Extractor

This snippet uses Quokka to extract features from the every function (and block) of the program.

Usage:
    python ./script <binary_path>

Author:
    Written by dm (Alexis Challande) in 2022.
"""

from __future__ import annotations

import json
import sys
from typing import Dict, Union, List
import quokka

# Use the code from arch.py in this repo
# Originally
# https://github.com/Cisco-Talos/binary_function_similarity/blob/main/IDA_scripts/IDA_acfg_features/core/architecture.py
ARCH_MNEM = ...

FeaturesDict = Dict[str, Union[int, List[str], List[int], "FeaturesDict"]]


def get_bb_features(block: quokka.Block) -> FeaturesDict:
    """Extract features from a Basic Block"""

    mnemonics = [inst.cs_inst.mnemonic for inst in block.instructions]
    arch = block.program.isa.name

    return {
        "bb_len": block.size,
        # List features
        "bb_numerics": block.constants,
        "bb_strings": block.strings,
        # Numeric features
        "n_numeric_consts": len(block.constants),
        "n_string_consts": len(block.strings),
        "n_instructions": len(mnemonics),
        "n_arith_instrs": sum(
            1 for m in mnemonics if m in ARCH_MNEM[arch]["arithmetic"]
        ),
        "n_call_instrs": sum(1 for m in mnemonics if m in ARCH_MNEM[arch]["call"]),
        "n_logic_instrs": sum(1 for m in mnemonics if m in ARCH_MNEM[arch]["logic"]),
        "n_transfer_instrs": sum(
            1 for m in mnemonics if m in ARCH_MNEM[arch]["transfer"]
        ),
        "n_redirect_instrs": sum(
            1
            for m in mnemonics
            if (m in ARCH_MNEM[arch]["unconditional"])
            or (m in ARCH_MNEM[arch]["conditional"])
            or (m in ARCH_MNEM[arch]["call"])
        ),
    }


def sum_block_features(bb_features: FeaturesDict, feature: str) -> int:
    """Sum the values for every basic block in the function"""
    assert feature.startswith("n_"), "Only numeric values can be summed"
    return sum(basic_block[feature] for basic_block in bb_features.values())


def get_func_features(func: quokka.Function) -> FeaturesDict:
    """Extracts features from a Function"""
    bb_features = {}
    for block_start in func.graph:
        block = func.get_block(block_start)
        bb_features[block_start] = get_bb_features(block)

    return {
        "n_func_calls": sum_block_features(bb_features, "n_call_instrs"),
        "n_logic_instrs": sum_block_features(bb_features, "n_logic_instrs"),
        "n_redirections": sum_block_features(bb_features, "n_redirect_instrs"),
        "n_transfer_instrs": sum_block_features(bb_features, "n_transfer_instrs"),
        "size_local_variables": ...,  # Not possible with Quokka
        "n_bb": len(bb_features),
        "n_edges": len(func.graph.edges),
        "n_incoming_calls": len(func.callers),
        "n_instructions": sum(1 for _ in func.instructions),
        "basic_blocks": bb_features,
    }


def export_binary(binary: quokka.Program) -> None:
    """Export features from a Program"""

    prog_features: FeaturesDict = {}
    for func in binary.values():
        prog_features[func.start] = get_func_features(func)

    with open(f"{binary.name}.json", "w") as fp:
        json.dump(prog_features, fp, indent=True)


if __name__ == "main":
    program: quokka.Program = quokka.Program.from_binary(sys.argv[1])
    export_binary(program)