Merge pull request #8 from CybercentreCanada/bugfix/max_score
Check for max score after the submission has finished analysis
cccs-kevin authored May 30, 2022
2 parents 67e55f5 + 0c9a0dd commit f83c98a
Showing 3 changed files with 81 additions and 31 deletions.
.gitignore (3 changes: 2 additions & 1 deletion)
@@ -3,4 +3,5 @@ test_dir/*
 *.txt
 !test/requirements.txt
 *.log
-__pycache__
+__pycache__
+*.egg-info/*
assemblyline_incident_manager/al_incident_downloader.py (15 changes: 12 additions & 3 deletions)
@@ -94,14 +94,14 @@ def main(url: str, username: str, apikey: str, max_score: int, incident_num: str
     submission_res = al_client.search.stream.submission(query, fl="sid")
     sids = []
 
-    print_and_log(log, f"INFO,Gathering the submission IDs.", logging.DEBUG)
+    print_and_log(log, "INFO,Gathering the submission IDs.", logging.DEBUG)
     for submission in submission_res:
         sid = submission["sid"]
         sids.append(sid)
     print_and_log(log, f"INFO,There are {len(sids)} submission IDs.", logging.DEBUG)
 
     total_already_downloaded = 0
-    for root, dir, files in os.walk(download_path):
+    for _, _, files in os.walk(download_path):
         total_already_downloaded += len(files)
 
     entered = False
@@ -125,10 +125,19 @@ def main(url: str, username: str, apikey: str, max_score: int, incident_num: str
     unique_file_paths = set()
     unique_file_hashes = set()
     start_time = time()
+    # "entered" is used so that we always enter this while loop regardless of the completion status of sids
     while not entered or not all(al_client.submission.is_completed(sid) for sid in sids):
         entered = True
         for sid in sids[:]:
-            if not al_client.submission.is_completed(sid):
+            # Loop until the submission is completed
+            while not al_client.submission.is_completed(sid):
                 sleep(2)
                 continue
+
+            # If the submission completes but the score ends up being higher than the max score.
+            # This any() condition should only contain a single item since SIDs are unique
+            if any(sub["max_score"] > max_score for sub in al_client.search.stream.submission(sid, fl="max_score")):
+                # Remove the SID since it does not meet the given criteria, and move on!
+                sids.remove(sid)
+                continue
             else:
                 sids.remove(sid)
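The change above makes the downloader wait until a submission has finished analysis before reading its max_score, and only then drops SIDs whose final score exceeds the threshold. A minimal standalone sketch of that poll-then-filter pattern — `submission.is_completed` and `search.stream.submission` are the client calls shown in the diff, while the `filter_completed_sids` wrapper and its signature are hypothetical:

from time import sleep

def filter_completed_sids(al_client, sids, max_score):
    """Wait for each submission to complete, then keep only SIDs scoring at or below max_score."""
    kept = []
    for sid in sids:
        # Block until Assemblyline reports the submission as completed
        while not al_client.submission.is_completed(sid):
            sleep(2)
        # The stream should yield exactly one record per SID, since SIDs are unique
        if any(sub["max_score"] > max_score
               for sub in al_client.search.stream.submission(sid, fl="max_score")):
            continue  # final score exceeds the threshold: drop this submission
        kept.append(sid)
    return kept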
assemblyline_incident_manager/al_incident_submitter.py (94 changes: 67 additions & 27 deletions)
@@ -90,7 +90,10 @@ def get_id_from_data(file_path: str) -> str:
 @click.option("--priority", default=100, required=False, type=click.INT,
               help="Provide a priority number which will cause the ingestion to go to a specific priority queue.")
 @click.option("--do_not_verify_ssl", is_flag=True, help="Ignore SSL errors (insecure!)")
-def main(url: str, username: str, apikey: str, ttl: int, classification: str, service_selection: str, is_test: bool, path: str, fresh: bool, incident_num: str, resubmit_dynamic: bool, alert: bool, threads: int, dedup_hashes: bool, priority: int, do_not_verify_ssl: bool):
+def main(
+        url: str, username: str, apikey: str, ttl: int, classification: str, service_selection: str, is_test: bool,
+        path: str, fresh: bool, incident_num: str, resubmit_dynamic: bool, alert: bool, threads: int,
+        dedup_hashes: bool, priority: int, do_not_verify_ssl: bool):
     """
     Example:
     al-incident_submitter --url="https://<domain-of-Assemblyline-instance>" --username="<user-name>" --apikey="/path/to/file/containing/apikey" --classification="<classification>" --service_selection="<service-name>,<service-name>" --path="/path/to/scan" --incident_num=123
@@ -111,7 +114,8 @@ def main(url: str, username: str, apikey: str, ttl: int, classification: str, se
 
     # Confirm that given path is to a directory
     if not os.path.isdir(path):
-        print_and_log(log, f"INFO,Provided path {path} points to a file, but it should point to a directory.", logging.DEBUG)
+        print_and_log(
+            log, f"INFO,Provided path {path} points to a file, but it should point to a directory.", logging.DEBUG)
         return
 
     if is_test:
@@ -192,22 +196,29 @@ def main(url: str, username: str, apikey: str, ttl: int, classification: str, se
         # We only care about files that occur after the last sha in the hash file
         if resume_ingestion_path:
             if prepared_file_path in skipped_file_paths:
-                print_and_log(log, f"INFO,Found a skipped file path {prepared_file_path}. Trying to ingest again!,{prepared_file_path},",
-                              logging.DEBUG)
-                file_queue.put((file_path, prepared_file_path, settings, incident_num, alert, file_count, dedup_hashes, True))
+                print_and_log(
+                    log,
+                    f"INFO,Found a skipped file path {prepared_file_path}. Trying to ingest again!,{prepared_file_path},",
+                    logging.DEBUG)
+                file_queue.put((file_path, prepared_file_path, settings,
+                                incident_num, alert, file_count, dedup_hashes, True))
                 continue
             elif resume_ingestion_path == prepared_file_path:
-                print_and_log(log, f"INFO,Found the most recently submitted file path {resume_ingestion_path},{prepared_file_path},",
-                              logging.DEBUG)
+                print_and_log(
+                    log,
+                    f"INFO,Found the most recently submitted file path {resume_ingestion_path},{prepared_file_path},",
+                    logging.DEBUG)
                 skip = False
 
         # If we have yet to reach the file that matches the last submitted file path, continue looking!
         if skip:
-            print_and_log(log,
-                          f"INFO,Seeking the file that matches this file path: {resume_ingestion_path}. {prepared_file_path} has already been ingested.,{prepared_file_path},",
-                          logging.DEBUG)
+            print_and_log(
+                log,
+                f"INFO,Seeking the file that matches this file path: {resume_ingestion_path}. {prepared_file_path} has already been ingested.,{prepared_file_path},",
+                logging.DEBUG)
             continue
-        file_queue.put((file_path, prepared_file_path, settings, incident_num, alert, file_count, dedup_hashes, False))
+        file_queue.put((file_path, prepared_file_path, settings,
+                        incident_num, alert, file_count, dedup_hashes, False))
 
     while file_queue.qsize():
         sleep(1)
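In plain terms, the resume pass above walks the prepared file paths in order: previously skipped paths are retried right away, every other path up to and including the last recorded one is passed over, and normal queueing resumes from that point. A condensed sketch of that control flow with the queueing replaced by a generator — the `resume_filter` name is hypothetical, and the yielded flag corresponds to the `was_skipped` element of the queued tuple:

def resume_filter(file_paths, resume_ingestion_path, skipped_file_paths):
    # Skip files until we reach the most recently submitted path from the previous run
    skip = bool(resume_ingestion_path)
    for prepared_file_path in file_paths:
        if resume_ingestion_path:
            if prepared_file_path in skipped_file_paths:
                yield prepared_file_path, True  # previously skipped: try to ingest again
                continue
            elif prepared_file_path == resume_ingestion_path:
                skip = False  # found the resume point; queue this and later files
        if skip:
            continue  # already ingested on a previous run
        yield prepared_file_path, False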
@@ -223,12 +234,18 @@ def main(url: str, username: str, apikey: str, ttl: int, classification: str, se
     print_and_log(log, f"INFO,Number of files ingested = {number_of_files_ingested}", logging.DEBUG)
     print_and_log(log, f"INFO,Number of duplicate files on system = {number_of_file_duplicates}", logging.DEBUG)
     print_and_log(log, f"INFO,Number of files skipped due to errors = {number_of_files_skipped}", logging.DEBUG)
-    print_and_log(log, f"INFO,Number of files with size greater than {MAX_FILE_SIZE}B = {number_of_files_greater_than_max_size}", logging.DEBUG)
-    print_and_log(log, f"INFO,Number of files with size less than {MIN_FILE_SIZE}B = {number_of_files_less_than_min_size}", logging.DEBUG)
+    print_and_log(
+        log, f"INFO,Number of files with size greater than {MAX_FILE_SIZE}B = {number_of_files_greater_than_max_size}",
+        logging.DEBUG)
+    print_and_log(
+        log, f"INFO,Number of files with size less than {MIN_FILE_SIZE}B = {number_of_files_less_than_min_size}",
+        logging.DEBUG)
     print_and_log(log, f"INFO,Total time elapsed: {round(time() - start_time, 3)}s", logging.DEBUG)
 
 
-def _generate_settings(ttl: int, classification: str, service_selection: List[str], resubmit_dynamic: bool, priority: int) -> dict:
+def _generate_settings(
+        ttl: int, classification: str, service_selection: List[str],
+        resubmit_dynamic: bool, priority: int) -> dict:
     settings = {
         "ttl": ttl,
         "classification": classification,
@@ -263,11 +280,17 @@ def _file_has_valid_size(file_path: str, prepared_file_path: str) -> (bool, int,
     max_count = 0
     min_count = 0
     if file_size > MAX_FILE_SIZE:
-        print_and_log(log, f"TOO_LARGE,{prepared_file_path} is too big. Size: {file_size} > {MAX_FILE_SIZE}.,{prepared_file_path},", logging.DEBUG)
+        print_and_log(
+            log,
+            f"TOO_LARGE,{prepared_file_path} is too big. Size: {file_size} > {MAX_FILE_SIZE}.,{prepared_file_path},",
+            logging.DEBUG)
         max_count += 1
         return False, max_count, min_count
     elif file_size < MIN_FILE_SIZE:
-        print_and_log(log, f"TOO_SMALL,{prepared_file_path} is too small. Size: {file_size} < {MIN_FILE_SIZE}.,{prepared_file_path},", logging.DEBUG)
+        print_and_log(
+            log,
+            f"TOO_SMALL,{prepared_file_path} is too small. Size: {file_size} < {MIN_FILE_SIZE}.,{prepared_file_path},",
+            logging.DEBUG)
         min_count += 1
         return False, max_count, min_count
     else:
@@ -285,17 +308,24 @@ def _test_ingest_file(al_client: Client4, settings: dict, incident_num: str, ale
     sha = get_id_from_data(TEST_FILE)
 
     # Ingesting the test file
-    print_and_log(log, f"INGEST,{TEST_FILE} ({sha}) is about to be ingested in test mode.,{TEST_FILE},{sha}", logging.DEBUG)
-    al_client.ingest(path=TEST_FILE, fname=TEST_FILE, params=settings, alert=alert, metadata={"filename": TEST_FILE, "incident_number": incident_num})
+    print_and_log(
+        log, f"INGEST,{TEST_FILE} ({sha}) is about to be ingested in test mode.,{TEST_FILE},{sha}", logging.DEBUG)
+    al_client.ingest(path=TEST_FILE, fname=TEST_FILE, params=settings, alert=alert,
+                     metadata={"filename": TEST_FILE, "incident_number": incident_num})
     print_and_log(log, f"INGEST,{TEST_FILE} ({sha}) has been ingested in test mode.,{TEST_FILE},{sha}", logging.DEBUG)
 
     os.remove(TEST_FILE)
 
 
-def _ingest_file(file_path: str, prepared_file_path: str, sha: str, al_client: Client4, settings: dict, incident_num: str, alert: bool):
-    print_and_log(log, f"INGEST,{prepared_file_path} ({sha}) is about to be ingested.,{prepared_file_path},{sha}", logging.DEBUG)
-    al_client.ingest(path=file_path, fname=sha, params=settings, alert=alert, metadata={"filename": file_path, "incident_number": incident_num})
-    print_and_log(log, f"INGEST,{prepared_file_path} ({sha}) has been ingested.,{prepared_file_path},{sha}", logging.DEBUG)
+def _ingest_file(
+        file_path: str, prepared_file_path: str, sha: str, al_client: Client4, settings: dict, incident_num: str,
+        alert: bool):
+    print_and_log(
+        log, f"INGEST,{prepared_file_path} ({sha}) is about to be ingested.,{prepared_file_path},{sha}", logging.DEBUG)
+    al_client.ingest(path=file_path, fname=sha, params=settings, alert=alert,
+                     metadata={"filename": file_path, "incident_number": incident_num})
+    print_and_log(
+        log, f"INGEST,{prepared_file_path} ({sha}) has been ingested.,{prepared_file_path},{sha}", logging.DEBUG)
 
 
 def _get_most_recent_file_path() -> (bool, str):
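Both wrappers reduce to the same Client4.ingest call. A hedged usage sketch, assuming a client built with assemblyline_client.get_client; the URL, credentials, and settings values are placeholders, and the `services` structure is an assumption (the diff only shows the `ttl` and `classification` keys of `_generate_settings`):

from assemblyline_client import get_client

# Placeholder connection details; the real scripts read these from CLI options
al_client = get_client("https://assemblyline.example", apikey=("username", "apikey-contents"))

settings = {
    "ttl": 30,
    "classification": "TLP:W",              # placeholder classification
    "services": {"selected": ["Extract"]},  # assumed shape for service selection
    "priority": 100,
}
al_client.ingest(
    path="/path/to/file",      # file to submit
    fname="<sha256-of-file>",  # the submitter names files by their hash
    params=settings,
    alert=False,
    metadata={"filename": "/path/to/file", "incident_number": "123"},
)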
@@ -348,8 +378,13 @@ def _thr_ingest_file(
     print_and_log(log, f"INFO,Number of files ingested = {number_of_files_ingested}", logging.DEBUG)
     print_and_log(log, f"INFO,Number of duplicate files on system = {number_of_file_duplicates}", logging.DEBUG)
     print_and_log(log, f"INFO,Number of files skipped due to errors = {number_of_files_skipped}", logging.DEBUG)
-    print_and_log(log, f"INFO,Number of files with size greater than {MAX_FILE_SIZE}B = {number_of_files_greater_than_max_size}", logging.DEBUG)
-    print_and_log(log, f"INFO,Number of files with size less than {MIN_FILE_SIZE}B = {number_of_files_less_than_min_size}", logging.DEBUG)
+    print_and_log(
+        log,
+        f"INFO,Number of files with size greater than {MAX_FILE_SIZE}B = {number_of_files_greater_than_max_size}",
+        logging.DEBUG)
+    print_and_log(
+        log, f"INFO,Number of files with size less than {MIN_FILE_SIZE}B = {number_of_files_less_than_min_size}",
+        logging.DEBUG)
     print_and_log(log, f"INFO,Progress = {round((file_count / total_file_count) * 100, 2)}%", logging.DEBUG)
 
     sha = None
@@ -366,7 +401,10 @@ def _thr_ingest_file(
 
     # If hash has already been submitted, then skip it
     if dedup_hashes and sha in hash_table:
-        print_and_log(log, f"DUPLICATE,{prepared_file_path} ({sha}) is a duplicate file. Skipping!,{prepared_file_path},{sha}", logging.DEBUG)
+        print_and_log(
+            log,
+            f"DUPLICATE,{prepared_file_path} ({sha}) is a duplicate file. Skipping!,{prepared_file_path},{sha}",
+            logging.DEBUG)
         number_of_file_duplicates += 1
         return
 
@@ -380,7 +418,8 @@ def _thr_ingest_file(
         FILE_PATHS_WRITER.write(f"{prepared_file_path}\n")
 
     except Exception as e:
-        print_and_log(log, f"SKIP,{prepared_file_path} was skipped due to {e}.,{prepared_file_path},{sha}", logging.ERROR)
+        print_and_log(
+            log, f"SKIP,{prepared_file_path} was skipped due to {e}.,{prepared_file_path},{sha}", logging.ERROR)
         number_of_files_skipped += 1
         SKIPPED_FILE_PATHS_WRITER.write(f"{prepared_file_path}\n")
 
@@ -392,7 +431,8 @@ def _thr_queue_reader(queue: Queue, al_client: Client4) -> None:
         return
     else:
         file_path, prepared_file_path, settings, incident_num, alert, file_count, dedup_hashes, was_skipped = msg
-        _thr_ingest_file(file_path, prepared_file_path, al_client, settings, incident_num, alert, file_count, dedup_hashes, was_skipped)
+        _thr_ingest_file(file_path, prepared_file_path, al_client, settings,
+                         incident_num, alert, file_count, dedup_hashes, was_skipped)
 
 
 if __name__ == "__main__":
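For context on `_thr_queue_reader`: the submitter is a small producer/consumer pipeline in which the main thread fills `file_queue` with ingest tuples and `threads` worker threads drain it, each calling `_thr_ingest_file` until a stop message arrives. A minimal sketch of that pattern; the sentinel value, worker count, and `process` stand-in are illustrative:

from queue import Queue
from threading import Thread

SENTINEL = None  # illustrative; the real script sends its own stop message

def process(msg):
    print(f"ingesting {msg}")  # stands in for _thr_ingest_file(*msg)

def queue_reader(queue):
    while True:
        msg = queue.get()
        if msg is SENTINEL:
            return  # stop message received: shut this reader down
        process(msg)

file_queue = Queue()
workers = [Thread(target=queue_reader, args=(file_queue,)) for _ in range(2)]
for worker in workers:
    worker.start()
for item in ("a.bin", "b.bin"):
    file_queue.put(item)
for _ in workers:
    file_queue.put(SENTINEL)  # one stop message per worker
for worker in workers:
    worker.join()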
