Skip to content

Commit

Permalink
Merge pull request #131 from CybercentreCanada/bugfix/duplicate_name_…
Browse files Browse the repository at this point in the history
…hashes

Bugfix/duplicate name hashes
  • Loading branch information
cccs-rs authored Jan 25, 2024
2 parents 730f000 + d964220 commit 0201d6b
Show file tree
Hide file tree
Showing 3 changed files with 624 additions and 47 deletions.
7 changes: 6 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,10 @@
"--max-line-length=120",
//Added the ignore of E203 for now : https://github.com/PyCQA/pycodestyle/issues/373
"--ignore=E203,W503"
]
],
"python.testing.pytestArgs": [
"test"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
114 changes: 68 additions & 46 deletions assemblyline_service_utilities/testing/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
from pathlib import Path

import pytest
from assemblyline_v4_service.common import helper
from assemblyline_v4_service.common.request import ServiceRequest
from assemblyline_v4_service.common.task import Task

from assemblyline.common import forge
from assemblyline.common.dict_utils import flatten
from assemblyline.common.str_utils import truncate
from assemblyline.common.uid import get_random_id
from assemblyline.odm.messages.task import Task as ServiceTask
from assemblyline_v4_service.common import helper
from assemblyline_v4_service.common.request import ServiceRequest
from assemblyline_v4_service.common.task import Task
from cart import unpack_file


Expand Down Expand Up @@ -503,53 +502,76 @@ def _data_compare(ih: IssueHelper, original, new, data_type, ignore_new_extra_fi

@staticmethod
def _file_compare(ih: IssueHelper, f_type, original, new):
# Create a set of maps for lookups between sha256 and names
oh_map, on_map, nh_map, nn_map = {}, {}, {}, {}
[
(
oh_map.setdefault(x["sha256"], []).append(x["name"]),
on_map.setdefault(x["name"], []).append(x["sha256"])
)
for x in original
]
[
(
nh_map.setdefault(x["sha256"], []).append(x["name"]),
nn_map.setdefault(x["name"], []).append(x["sha256"])
)
for x in new
]
if original == new:
# These file lists are exactly identical, no need to process for differences
return
elif len(new) == len(original) and all(n in original for n in new):
# The file lists, although the contents might be out of order, are the same
return

# Prune out items in the lists where nothing changed
for file in list(original):
if file in original and file in new:
original.remove(file)
new.remove(file)

# All remaining files in the new file list are assumed to have been additions to the original
change_record = [(ih.ACTION_ADDED, x["name"], x["sha256"]) for x in new]

# Check to see if any of the files in the old list have been changed when comparing against the new files
for x in original:
name, sha256 = x["name"], x["sha256"]

# We assume that if there's any change, it only applies to one file at a time
change_action = None

# During this check, we are going to priorize changes that involve hash for the same filename
hash_changes = [c for c in change_record if c[0] == ih.ACTION_ADDED and c[1] == name and c[2] != sha256]
for r in hash_changes:
# The filename is the same but the hashes are different
change_action = (ih.ACTION_CHANGED, "hash_change", name, sha256, r[2])
change_record.insert(change_record.index(r), change_action)
change_record.remove(r)
break

if change_action:
continue

oh_map = {x["sha256"]: x["name"] for x in original}
on_map = {x["name"]: x["sha256"] for x in original}
nh_map = {x["sha256"]: x["name"] for x in new}
nn_map = {x["name"]: x["sha256"] for x in new}
name_changes = [c for c in change_record if c[0] == ih.ACTION_ADDED and c[1] != name and c[2] == sha256]
for r in name_changes:
# The hash is the same but the filenames are different
change_action = (ih.ACTION_CHANGED, "name_change", sha256, name, r[1])
change_record.insert(change_record.index(r), change_action)
change_record.remove(r)
break

for sha256, name in oh_map.items():
if sha256 not in nh_map:
if name not in nn_map:
ih.add_issue(f_type, ih.ACTION_MISSING, f"File '{name} [{sha256}]' missing from the file list.")
continue
if change_action:
continue

if sha256 not in nn_map[name]:
ih.add_issue(
f_type,
ih.ACTION_CHANGED,
f"The sha256 of the file '{name}' has changed. {sha256} -> {nn_map[name]}",
)
continue
# Assume this is a old file that's missing in the output
change_record.append((ih.ACTION_MISSING, name, sha256))

if name not in nh_map[sha256]:
ih.add_issue(
f_type,
ih.ACTION_CHANGED,
f"The name of the file '{sha256}' has changed. {name} -> {nh_map[sha256]}",
)
continue
sha256_change_msg = "The sha256 of the file '{}' has changed. {} -> {}"
name_change_msg = "The name of the file '{}' has changed. {} -> {}"
file_added_msg = "File '{} [{}]' added to the file list."
file_missing_msg = "File '{} [{}]' missing from the file list."

# Process the change record for issue handling in the given order: ACTION_MISSING, ACTION_CHANGED, ACTION_ADDED
for record in sorted(change_record, key=lambda x: ["--", "-+", "++"].index(x[0])):
message = None
if record[0] == ih.ACTION_CHANGED:
if record[1] == "hash_change":
message = sha256_change_msg.format(*record[2:])

elif record[1] == "name_change":
message = name_change_msg.format(*record[2:])

elif record[0] == ih.ACTION_ADDED:
message = file_added_msg.format(*record[1:])
elif record[0] == ih.ACTION_MISSING:
message = file_missing_msg.format(*record[1:])

for sha256, name in nh_map.items():
if sha256 not in oh_map and name not in on_map:
ih.add_issue(f_type, ih.ACTION_ADDED, f"File '{name} [{sha256}]' added to the file list.")
ih.add_issue(f_type, record[0], message)

def regenerate_results(self, save_files=False, sample_sha256=""):
for f in self.result_list():
Expand Down
Loading

0 comments on commit 0201d6b

Please sign in to comment.