Skip to content

Commit

Permalink
feat: add khash in exported class and documents
Browse files Browse the repository at this point in the history
  • Loading branch information
davendu committed Apr 29, 2024
1 parent 822461f commit 9b0c40e
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 3 deletions.
43 changes: 42 additions & 1 deletion docs/quick_start.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,47 @@
" print(\"----\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also get a file's KHash, which can be used to compare similarities:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A<->B: 0.958984375\n",
"A<->C: 0.583984375\n",
"B<->C: 0.580078125\n"
]
}
],
"source": [
"from binaryai import BinaryAIFile\n",
"\n",
"fileA = BinaryAIFile(bai, md5=\"346136457e1eb6eca44a06bb55f93284\").get_khash_info()\n",
"fileB = BinaryAIFile(bai, sha256=\"841de34799fc46bf4b926559e4e7a70e0cc386050963978d5081595e9a280ae1\").get_khash_info()\n",
"fileC = BinaryAIFile(bai, sha256=\"9b53a3936c8c4202e418c37cbadeaef7cc7471f6a6522f6ead1a19b31831f4a1\").get_khash_info()\n",
"assert fileA[1] == fileB[1]\n",
"assert fileB[1] == fileC[1]\n",
"\n",
"# calculate hamming distance\n",
"def khash_similarity(khash_a: str, khash_b: str):\n",
"    from scipy.spatial import distance\n",
"    # bin() drops leading zero bits, so pad each hash back to its full width (4 bits per hex digit);\n",
"    # otherwise the two bit vectors can differ in length or be compared out of alignment.\n",
"    bits_a = list(bin(int(khash_a, 16))[2:].zfill(len(khash_a) * 4))\n",
"    bits_b = list(bin(int(khash_b, 16))[2:].zfill(len(khash_b) * 4))\n",
"    return 1 - distance.hamming(bits_a, bits_b)\n",
"print(f\"A<->B: {khash_similarity(fileA[0].hex(), fileB[0].hex())}\")\n",
"print(f\"A<->C: {khash_similarity(fileA[0].hex(), fileC[0].hex())}\")\n",
"print(f\"B<->C: {khash_similarity(fileB[0].hex(), fileC[0].hex())}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -224,7 +265,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
},
"orig_nbformat": 4
},
Expand Down
1 change: 1 addition & 0 deletions examples/binaryai_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def main():

print(bf1_files)
print(bf2_files)
print(bf1.get_khash_info())

print("done")

Expand Down
89 changes: 88 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "binaryai"
version = "0.7.1"
version = "0.8.0"
description = "BinaryAI-SDK is a library that provides an abstracted client module to simplify the procedure of uploading files for analysis."
license = "GPLv3"
authors = ["binaryai <binaryai@tencent.com>"]
Expand All @@ -26,6 +26,7 @@ black = "22.6.0"
isort = "5.10.1"
ariadne-codegen = "^0.13.0"
pytest = "^7.4.2"
scipy = "^1.13.0"

[tool.poetry.group.docs.dependencies]
sphinx-autoapi = "^2.1.1"
Expand Down
8 changes: 8 additions & 0 deletions src/binaryai/binaryai_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,11 @@ def get_func_match(self, offset: int) -> List[MatchedFunction]:
The List is sorted by score from high to low.
"""
return self._bai.get_func_match(self.sha256, offset)

def get_khash_info(self) -> "Optional[tuple[bytes, str]]":
    """Return the KHash of this file.

    See website for detailed introduction on KHash.

    Returns:
        Optional[Tuple[bytes, str]]: KHash's value and version, exactly as
        returned by ``self._bai.get_khash_info`` for this file's sha256.
        That call is presumably the client method of the same name, which
        is annotated ``Optional`` and can return ``None``, so check for
        ``None`` before indexing the result.
        Only compare two KHash values if their versions are the same.
    """
    # The return annotation is a string so it is not evaluated when the
    # function is defined.
    return self._bai.get_khash_info(self.sha256)
21 changes: 21 additions & 0 deletions src/binaryai/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import binascii
import logging
import os
import threading
Expand Down Expand Up @@ -564,3 +565,23 @@ def get_func_match(self, sha256: str, offset: int) -> List[MatchedFunction]:
)
matched_func_list.append(matched_func)
return matched_func_list

def get_khash_info(self, sha256: str) -> Optional[tuple[bytes, str]]:
    """Return the KHash of the file identified by ``sha256``.

    See website for detailed introduction on KHash.

    Args:
        sha256: Hash passed to ``self._client.file_k_hash`` to look up the file.

    Returns:
        Optional[Tuple[bytes, str]]: KHash's value (raw bytes) and version, or
        ``None`` when the response has no file, no decompile result, or no
        KHash info. Only compare two values if their versions are the same.
        You are not expected to parse version.
    """
    response = self._client.file_k_hash(sha256)

    # Walk down the response one level at a time; any missing level means
    # there is no KHash to report.
    file_node = response.file
    decompiled = file_node.decompile_result if file_node else None
    khash_info = decompiled.k_hash_info if decompiled else None
    if not khash_info:
        return None

    # The hash value arrives as a hex string; decode it to raw bytes.
    raw_value = binascii.unhexlify(khash_info.hash.hash)
    version = khash_info.hash.version
    return (raw_value, version)

0 comments on commit 9b0c40e

Please sign in to comment.