Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent domain matches in Windows paths #137

Merged
merged 3 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class PatternMatch(object):

# --- Regex Patterns -----------------------------------------------------------------------------------------------

# Domain pattern whose match start is anchored only by a word boundary (\b).
# NOTE(review): this name is rebound by the assignment directly below.
PAT_DOMAIN = rb"(?i)\b(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})(?![A-Z.-])"
# Domain pattern whose match start is guarded by the negative lookbehind (?<![-\w.\\]):
# a match may not begin right after a hyphen, a word character, a dot, or a backslash.
# The backslash case keeps the last component of a Windows path (e.g. C:\path\name.com)
# from being reported as a domain.
PAT_DOMAIN = rb"(?i)(?<![-\w.\\])(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})(?![A-Z.-])"
PAT_FILECOM = (
rb"(?i)(?:\b[a-z]?[:]?[- _A-Z0-9.\\~]{0,75}[%]?"
rb"(?:ALLUSERPROFILE|APPDATA|commonappdata|CommonProgramFiles|HOMEPATH|LOCALAPPDATA|"
Expand Down
19 changes: 6 additions & 13 deletions assemblyline_service_utilities/common/dynamic_service_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3097,20 +3097,13 @@ def extract_iocs_from_text_blob(
network_tag_type = "dynamic"

ips = set([ip.value.decode() for ip in find_ips(blob.encode())])
# There is overlap here between regular expressions, so we want to isolate domains that are not ips
domains = set([domain.value.decode() for domain in find_domains(blob.encode())])

# When extracting domains from byte blobs, we need to be careful
if any(domain.startswith("x") for domain in domains):
# Remove all byte characters from blob, then check if domain exists
# Commas cannot exist in a domain so let's replace with that for now
modified_blob = sub(BYTE_STRING, ",", blob)
domains_from_mod_blob = set(findall(DOMAIN_REGEX, modified_blob))
for domain in domains.copy():
if domain[3:] in domains_from_mod_blob:
_ = domains.remove(domain)
domains.add(domain[3:])

# Remove all byte characters from blob, then check if domain exists
# Commas cannot exist in a domain so let's replace with that for now
modified_blob = sub(BYTE_STRING, ",", blob)

# There is overlap here between regular expressions, so we want to isolate domains that are not ips
domains = set([domain.value.decode() for domain in find_domains(modified_blob.encode())])
domains = domains - ips

# There is overlap here between regular expressions, so we want to isolate uris that are not domains
Expand Down
20 changes: 14 additions & 6 deletions test/balbuzard/test_patterns.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

import pytest

from assemblyline_service_utilities.common.balbuzard.patterns import PatternMatch


Expand All @@ -23,15 +24,22 @@ def test_PAT_IP(data, ip):
"data,domain",
[
(b"config.edge.skype.com0", b"config.edge.skype.com"),
(b"domain.com-", None),
],
)
def test_PAT_DOMAIN(data, domain):
match = re.search(PatternMatch.PAT_DOMAIN, data)
if domain is None:
assert match is None
else:
assert match.group() == domain
assert re.search(PatternMatch.PAT_DOMAIN, data).group() == domain


@pytest.mark.parametrize(
    "domain",
    [
        b"domain.com-",
        b"C:\\path\\looks-like-a-domain.com",
        b"C:\\path\\looks.like.a.domain.com",
    ],
)
def test_PAT_DOMAIN_false_positives(domain):
    """Check that PAT_DOMAIN finds no match anywhere in domain-looking input.

    Each parametrized value is a byte string that resembles a domain (a name
    with a trailing hyphen, or the tail of a Windows path) and must not be
    matched by the pattern.
    """
    found = re.search(PatternMatch.PAT_DOMAIN, domain)
    # re.search returns None when nothing matches; a Match object is always truthy.
    assert found is None


@pytest.mark.parametrize(
Expand Down