diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py index cb0838443..4f64dd78a 100644 --- a/assemblyline/datastore/collection.py +++ b/assemblyline/datastore/collection.py @@ -8,19 +8,19 @@ import typing import warnings -from copy import deepcopy -from assemblyline.common.isotime import now_as_iso -from datemath import dm -from datemath.helpers import DateMathException from datetime import datetime from enum import Enum from os import environ -from typing import Dict, Any, Union, TypeVar, Generic +from typing import Dict, Any, Union, TypeVar, Generic, Optional +from copy import deepcopy +from datemath import dm +from datemath.helpers import DateMathException import elasticsearch import elasticsearch.helpers from assemblyline import odm +from assemblyline.common.isotime import now_as_iso from assemblyline.common.dict_utils import recursive_update from assemblyline.datastore.bulk import ElasticBulkPlan from assemblyline.datastore.exceptions import ( @@ -222,8 +222,8 @@ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, ar if field.store: self.stored_fields[name] = field - def is_archive_index(self, index): - return self.archive_name and index.startswith(self.archive_name) + def is_archive_index(self, index) -> bool: + return bool(self.archive_name and index.startswith(self.archive_name)) def get_index_list(self, index_type): # Default value @@ -2032,17 +2032,17 @@ def __get_possible_fields(self, field): return field_types - def _check_fields(self, model=None): + def _check_fields(self, target_model: Optional[odm.Model] = None): if not self.validate: return - if model is None: + if target_model is None: if self.model_class: return self._check_fields(self.model_class) return fields = self.fields() - model = self.model_class.flat_fields(skip_mappings=True) + model = target_model.flat_fields(skip_mappings=True) missing = set(model.keys()) - set(fields.keys()) if missing: @@ -2071,7 +2071,7 @@ def _ensure_collection(self): index = f"{alias}_hot" # Create HOT index if not self.with_retries(self.datastore.client.indices.exists, index=alias): - log.debug(f"Index {alias.upper()} does not exists. Creating it now...") + log.debug("Index %s does not exists. 
Creating it now...")
+            log.debug("Index %s does not exists.
Creating it now...", alias.upper()) try: self.with_retries(self.datastore.client.indices.create, index=index, mappings=self._get_index_mappings(), @@ -2079,7 +2079,7 @@ def _ensure_collection(self): except elasticsearch.exceptions.RequestError as e: if "resource_already_exists_exception" not in str(e): raise - log.warning(f"Tried to create an index template that already exists: {alias.upper()}") + log.warning("Tried to create an index template that already exists: %s", alias.upper()) self.with_retries(self.datastore.client.indices.put_alias, index=index, name=alias) elif not self.with_retries(self.datastore.client.indices.exists, index=index) and \ diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py index 4f713100b..1b0af6f5c 100644 --- a/assemblyline/datastore/support/build.py +++ b/assemblyline/datastore/support/build.py @@ -1,5 +1,5 @@ from assemblyline.odm.base import _Field -from assemblyline.odm import Keyword, Text, List, Compound, Date, Integer, Long, \ +from assemblyline.odm import Keyword, Wildcard, Text, List, Compound, Date, Integer, Long, \ Float, Boolean, Mapping, Classification, Enum, Any, UUID, Optional, IP, Domain, URI, URIPath, MAC, PhoneNumber, \ SSDeepHash, SHA1, SHA256, MD5, Platform, Processor, ClassificationString, FlattenedObject, Email, UpperKeyword, \ Json, ValidatedKeyword, UNCPath @@ -7,6 +7,7 @@ # Simple types can be resolved by a direct mapping __type_mapping = { Keyword: 'keyword', + Wildcard: 'wildcard', Boolean: 'boolean', Integer: 'integer', Long: 'long', @@ -111,6 +112,11 @@ def set_mapping(temp_field: _Field, body): "analyzer": __analyzer_mapping[field.__class__] }) + elif isinstance(field, Wildcard): + es_data_type = __type_mapping[field.__class__] + data = {'type': es_data_type} + mappings[name.strip(".")] = data + elif isinstance(field, Keyword): es_data_type = __type_mapping[field.__class__] data = {'type': es_data_type} diff --git a/assemblyline/odm/__init__.py b/assemblyline/odm/__init__.py index 88bef276a..6a99ec36c 100644 --- a/assemblyline/odm/__init__.py +++ b/assemblyline/odm/__init__.py @@ -5,7 +5,8 @@ # Imports that have the same effect as some part of the one above so that # type checking can use this file properly. -from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, Date, Long, Enum +from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, \ + Date, Long, Enum, Wildcard from datetime import datetime _InnerType = typing.TypeVar("_InnerType") @@ -27,6 +28,10 @@ def keyword(*args, **kwargs) -> str: return typing.cast(str, Keyword(*args, **kwargs)) +def wildcard(*args, **kwargs) -> str: + return typing.cast(str, Wildcard(*args, **kwargs)) + + def date(*args, **kwargs) -> datetime: return typing.cast(datetime, Date(*args, **kwargs)) diff --git a/assemblyline/odm/base.py b/assemblyline/odm/base.py index 6a3ff54ff..7b5928626 100644 --- a/assemblyline/odm/base.py +++ b/assemblyline/odm/base.py @@ -19,6 +19,7 @@ import sys import unicodedata from datetime import datetime +import typing from typing import Any as _Any from typing import Dict, Tuple, Union @@ -290,6 +291,31 @@ def check(self, value, **kwargs): return str(value) +class Wildcard(Keyword): + """ + A keyword with enhanced indexing to support more complex queries. 
+ """ + + def check(self, value, **kwargs): + if self.optional and value is None: + return None + + # We have a special case for bytes here due to how often strings and bytes + # get mixed up in python apis + if isinstance(value, bytes): + raise ValueError(f"[{self.name or self.parent_name}] Keyword doesn't accept bytes values") + + if value == '' or value is None: + if self.default_set: + value = self.default + else: + raise ValueError(f"[{self.name or self.parent_name}] Empty strings are not allowed without defaults") + + if value is None: + return None + + return str(value) + class EmptyableKeyword(_Field): """ A keyword which allow to differentiate between empty and None values. @@ -638,9 +664,9 @@ def check(self, value, **kwargs): class Integer(_Field): - """A field storing an integer value.""" + """A field storing a signed 32 bit integer value.""" - def __init__(self, max: int = None, min: int = None, *args, **kwargs): + def __init__(self, max: typing.Optional[int] = None, min: typing.Optional[int] = None, *args, **kwargs): super().__init__(*args, **kwargs) self.max = max self.min = min @@ -668,35 +694,8 @@ def check(self, value, **kwargs): return ret_val -class Long(_Field): - """A field storing an integer value.""" - - def __init__(self, max: int = None, min: int = None, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max = max - self.min = min - - def check(self, value, **kwargs): - if self.optional and value is None: - return None - - if value is None or value == "": - if self.default_set: - ret_val = self.default - else: - raise ValueError(f"[{self.name or self.parent_name}] No value provided and no default value set.") - else: - ret_val = int(value) - - # Test min/max - if self.max is not None and ret_val > self.max: - raise ValueError( - f"[{self.name or self.parent_name}] Value bigger then the max value. ({value} > {self.max})") - if self.min is not None and ret_val < self.min: - raise ValueError( - f"[{self.name or self.parent_name}] Value smaller then the min value. 
({value} < {self.max})") - - return ret_val +class Long(Integer): + """A field storing a signed 64 bit integer value.""" class Float(_Field): diff --git a/assemblyline/odm/models/alert.py b/assemblyline/odm/models/alert.py index affb0a8a6..e0763ed6a 100644 --- a/assemblyline/odm/models/alert.py +++ b/assemblyline/odm/models/alert.py @@ -115,7 +115,7 @@ class File(odm.Model): name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.") sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.") sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.") - size = odm.Integer(store=False, description="The size of the file in bytes.") + size = odm.long(store=False, description="The size of the file in bytes.") type = odm.Keyword(copyto="__text__", description=" The file type as identified by Assemblyline's analysis.") screenshots = odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.") @@ -202,7 +202,7 @@ class Alert(odm.Model): filtered = odm.Boolean(default=False, description="Indicates whether portions of the submission's analysis results have been omitted due to the user's classification level not meeting the required threshold for viewing certain data.") heuristic = odm.Compound(Heuristic, description="Data regarding the heuristics that triggered the alert.") label = odm.List(odm.Keyword(), copyto="__text__", default=[], description="Labels assigned to the alert for categorization and filtering.") - metadata = odm.FlattenedObject(default={}, store=False, description="Additional metadata provided with the file at the time of submission.") + metadata = odm.Mapping(odm.wildcard(), copyto="__text__", default={}, store=False, description="Additional metadata provided with the file at the time of submission.") owner = odm.Optional(odm.Keyword(), description="Specifies the user or system component that has taken ownership of the alert. 
If no user has claimed the alert, it remains under system ownership with no specific user associated, indicated by a value of `None`.") priority = odm.Optional(odm.Enum(values=PRIORITIES), description="Indicates the importance level assigned to the alert.") reporting_ts = odm.Date(description="Timestamp when the alert was created.") diff --git a/assemblyline/odm/models/badlist.py b/assemblyline/odm/models/badlist.py index 117f6c08b..a557bfd5e 100644 --- a/assemblyline/odm/models/badlist.py +++ b/assemblyline/odm/models/badlist.py @@ -30,7 +30,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/config.py b/assemblyline/odm/models/config.py index b9d767a1f..25f3fced0 100644 --- a/assemblyline/odm/models/config.py +++ b/assemblyline/odm/models/config.py @@ -2014,7 +2014,7 @@ class Submission(odm.Model): emptyresult_dtl: int = odm.Integer(min=0, description="Number of days emptyresult will remain in the system") max_dtl: int = odm.Integer(min=0, description="Maximum number of days submissions will remain in the system") max_extraction_depth: int = odm.Integer(description="Maximum files extraction depth") - max_file_size: int = odm.Integer(description="Maximum size for files submitted in the system") + max_file_size: int = odm.long(description="Maximum size for files submitted in the system") max_metadata_length: int = odm.Integer(description="Maximum length for each metadata values") max_temp_data_length: int = odm.Integer(description="Maximum length for each temporary data values") metadata: MetadataConfig = odm.Compound(MetadataConfig, default=DEFAULT_METADATA_CONFIGURATION, diff --git a/assemblyline/odm/models/file.py b/assemblyline/odm/models/file.py index aa433b086..1948a6e00 100644 --- a/assemblyline/odm/models/file.py +++ b/assemblyline/odm/models/file.py @@ -102,7 +102,7 @@ class File(odm.Model): seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False) sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False) sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.") - size = odm.Integer(description="Size of the file in bytes.") + size = odm.long(description="Size of the file in bytes.") ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, which is useful for identifying similar files.", ai=False) type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.") tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False) diff --git a/assemblyline/odm/models/safelist.py b/assemblyline/odm/models/safelist.py index 0d6329efe..81900e8d0 100644 --- a/assemblyline/odm/models/safelist.py +++ b/assemblyline/odm/models/safelist.py @@ -17,7 +17,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, 
copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/signature.py b/assemblyline/odm/models/signature.py index 341ffed25..923795291 100644 --- a/assemblyline/odm/models/signature.py +++ b/assemblyline/odm/models/signature.py @@ -14,7 +14,7 @@ @odm.model(index=True, store=True) class Signature(odm.Model): classification = odm.Classification(store=True, default=Classification.UNRESTRICTED) - data = odm.Text(index=False, store=False) + data = odm.Text(copyto="__text__", store=False) last_modified = odm.Date(default="NOW") name = odm.Keyword(copyto="__text__") order = odm.Integer(default=1, store=False) diff --git a/assemblyline/odm/models/submission.py b/assemblyline/odm/models/submission.py index 23615a4a0..8f2f6afb4 100644 --- a/assemblyline/odm/models/submission.py +++ b/assemblyline/odm/models/submission.py @@ -12,7 +12,7 @@ @odm.model(index=True, store=False, description="File Model of Submission") class File(odm.Model): name = odm.Keyword(copyto="__text__", description="Name of the file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file") @@ -140,9 +140,9 @@ class Submission(odm.Model): file_count = odm.Integer(description="Total number of files in the submission", ai=False) files: list[File] = odm.List(odm.Compound(File), description="List of files that were originally submitted") max_score = odm.Integer(description="Maximum score of all the files in the scan") - metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission") + metadata = odm.Mapping(odm.wildcard(), copyto="__text__", store=False, description="Metadata associated to the submission") params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False) - results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False) + results: list[str] = odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False) sid: str = odm.UUID(copyto="__text__", description="Submission ID") state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False) to_be_deleted = odm.Boolean( diff --git a/test/test_odm_mapping.py b/test/test_odm_mapping.py new file mode 100644 index 000000000..236047163 --- /dev/null +++ b/test/test_odm_mapping.py @@ -0,0 +1,83 @@ +from assemblyline import odm +from assemblyline.datastore.collection import ESCollection +from assemblyline.datastore.support.build import build_mapping + + +@odm.model(index=True) +class OdmTestMapping1(odm.Model): + stable_text_field = odm.keyword() + swapped_text_field = odm.keyword() + stable_number_field = odm.integer() + swapped_number_field = odm.integer() + + +@odm.model(index=True) +class OdmTestMapping2(odm.Model): + stable_text_field = odm.keyword() + swapped_text_field = odm.wildcard() + stable_number_field = odm.integer() + swapped_number_field = odm.long() + + +def test_example_mapping_type(): + """Test that the example models produce the expected mapping types""" + properties, dynamic = build_mapping(OdmTestMapping1.fields().values()) + 
+ # There should be no dynamic mappings, just one rule forbidding implicit mappings + assert len(dynamic) == 1 + assert 'refuse_all_implicit_mappings' in dynamic[0] + + # Check that the static fields have the mapping type we want + assert len(properties) == 4 + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + properties, dynamic = build_mapping(OdmTestMapping2.fields().values()) + + # There should be no dynamic mappings, just one rule forbidding implicit mappings + assert len(dynamic) == 1 + assert 'refuse_all_implicit_mappings' in dynamic[0] + + # Check that the static fields have the mapping type we want + assert len(properties) == 4 + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'wildcard' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'long' + + +def test_field_upgrade_ok(datastore_connection): + """Test that changing a field from keyword to wildcard doesn't break anything.""" + # Clean up from any previous runs + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=False) + collection.wipe(recreate=False) + + # Create the collection in elastic + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=True) + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + # Open that same collection using the new mapping + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping2, validate=True) + + # Check that the fields haven't changed + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + # Reindex + collection.reindex() + + # Check that the fields match the new model + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'wildcard' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'long'
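
Note (not part of the patch): a minimal usage sketch, assuming the patched assemblyline package is installed, showing how the new odm.wildcard() helper and the reworked odm.long() are expected to map to Elasticsearch field types through build_mapping. The model name ExampleModel is hypothetical; the assertions mirror test_odm_mapping.py above.

    from assemblyline import odm
    from assemblyline.datastore.support.build import build_mapping

    # ExampleModel is a hypothetical model used only for illustration
    @odm.model(index=True)
    class ExampleModel(odm.Model):
        label = odm.wildcard()  # expected to index as the Elasticsearch 'wildcard' type
        size = odm.long()       # expected to index as a signed 64-bit 'long'

    # build_mapping returns the static property mappings plus the dynamic templates
    properties, dynamic = build_mapping(ExampleModel.fields().values())
    assert properties['label']['type'] == 'wildcard'
    assert properties['size']['type'] == 'long'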