Skip to content

Commit

Permalink
Merge branch 'main' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
cccs-jh committed Feb 23, 2024
2 parents 28b8d41 + fb164bf commit 4ee9423
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class PatternMatch(object):

# --- Regex Patterns -----------------------------------------------------------------------------------------------

PAT_DOMAIN = rb"(?i)\b(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})(?![A-Z.-])"
PAT_DOMAIN = rb"(?i)(?<![-\w.\\])(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})(?![A-Z.-])"
PAT_FILECOM = (
rb"(?i)(?:\b[a-z]?[:]?[- _A-Z0-9.\\~]{0,75}[%]?"
rb"(?:ALLUSERPROFILE|APPDATA|commonappdata|CommonProgramFiles|HOMEPATH|LOCALAPPDATA|"
Expand Down
19 changes: 6 additions & 13 deletions assemblyline_service_utilities/common/dynamic_service_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3097,20 +3097,13 @@ def extract_iocs_from_text_blob(
network_tag_type = "dynamic"

ips = set([ip.value.decode() for ip in find_ips(blob.encode())])
# There is overlap here between regular expressions, so we want to isolate domains that are not ips
domains = set([domain.value.decode() for domain in find_domains(blob.encode())])

# When extracting domains from byte blobs, we need to be careful
if any(domain.startswith("x") for domain in domains):
# Remove all byte characters from blob, then check if domain exists
# Commas cannot exist in a domain so let's replace with that for now
modified_blob = sub(BYTE_STRING, ",", blob)
domains_from_mod_blob = set(findall(DOMAIN_REGEX, modified_blob))
for domain in domains.copy():
if domain[3:] in domains_from_mod_blob:
_ = domains.remove(domain)
domains.add(domain[3:])

# Replace every BYTE_STRING match in the blob with a comma before extracting domains.
# Commas cannot appear in a domain, so a comma is a safe placeholder.
modified_blob = sub(BYTE_STRING, ",", blob)

# There is overlap here between regular expressions, so we want to isolate domains that are not ips
domains = set([domain.value.decode() for domain in find_domains(modified_blob.encode())])
domains = domains - ips

# There is overlap here between regular expressions, so we want to isolate uris that are not domains
Expand Down
20 changes: 14 additions & 6 deletions test/balbuzard/test_patterns.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

import pytest

from assemblyline_service_utilities.common.balbuzard.patterns import PatternMatch


Expand All @@ -23,15 +24,22 @@ def test_PAT_IP(data, ip):
"data,domain",
[
(b"config.edge.skype.com0", b"config.edge.skype.com"),
(b"domain.com-", None),
],
)
def test_PAT_DOMAIN(data, domain):
match = re.search(PatternMatch.PAT_DOMAIN, data)
if domain is None:
assert match is None
else:
assert match.group() == domain
assert re.search(PatternMatch.PAT_DOMAIN, data).group() == domain


@pytest.mark.parametrize(
    "domain",
    [
        b"domain.com-",
        b"C:\\path\\looks-like-a-domain.com",
        b"C:\\path\\looks.like.a.domain.com",
    ],
)
def test_PAT_DOMAIN_false_positives(domain):
    """Check that PAT_DOMAIN finds no match in inputs that only resemble domains.

    The parametrized cases are a name followed by a trailing hyphen and
    domain-looking segments that follow a backslash in a Windows-style path.
    """
    # re.search returns None when the pattern matches nowhere in the input.
    found = re.search(PatternMatch.PAT_DOMAIN, domain)
    assert found is None


@pytest.mark.parametrize(
Expand Down

0 comments on commit 4ee9423

Please sign in to comment.