Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to the mappings #1898

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions assemblyline/datastore/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@
import typing
import warnings

from copy import deepcopy
from assemblyline.common.isotime import now_as_iso
from datemath import dm
from datemath.helpers import DateMathException
from datetime import datetime
from enum import Enum
from os import environ
from typing import Dict, Any, Union, TypeVar, Generic
from typing import Dict, Any, Union, TypeVar, Generic, Optional
from copy import deepcopy

from datemath import dm
from datemath.helpers import DateMathException
import elasticsearch
import elasticsearch.helpers

from assemblyline import odm
from assemblyline.common.isotime import now_as_iso
from assemblyline.common.dict_utils import recursive_update
from assemblyline.datastore.bulk import ElasticBulkPlan
from assemblyline.datastore.exceptions import (
Expand Down Expand Up @@ -222,8 +222,8 @@ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, ar
if field.store:
self.stored_fields[name] = field

def is_archive_index(self, index):
return self.archive_name and index.startswith(self.archive_name)
def is_archive_index(self, index) -> bool:
    """Return True when the given index name belongs to this collection's archive.

    An index is considered part of the archive when an archive name is
    configured for the collection and the index name starts with it.
    """
    # No archive configured for this collection: nothing can be an archive index.
    if not self.archive_name:
        return False
    return index.startswith(self.archive_name)

def get_index_list(self, index_type):
# Default value
Expand Down Expand Up @@ -2032,17 +2032,17 @@ def __get_possible_fields(self, field):

return field_types

def _check_fields(self, model=None):
def _check_fields(self, target_model: Optional[odm.Model] = None):
if not self.validate:
return

if model is None:
if target_model is None:
if self.model_class:
return self._check_fields(self.model_class)
return

fields = self.fields()
model = self.model_class.flat_fields(skip_mappings=True)
model = target_model.flat_fields(skip_mappings=True)

missing = set(model.keys()) - set(fields.keys())
if missing:
Expand Down Expand Up @@ -2071,15 +2071,15 @@ def _ensure_collection(self):
index = f"{alias}_hot"
# Create HOT index
if not self.with_retries(self.datastore.client.indices.exists, index=alias):
log.debug(f"Index {alias.upper()} does not exists. Creating it now...")
log.debug("Index %s does not exists. Creating it now...", alias.upper())
try:
self.with_retries(self.datastore.client.indices.create, index=index,
mappings=self._get_index_mappings(),
settings=self._get_index_settings(archive=self.is_archive_index(index)))
except elasticsearch.exceptions.RequestError as e:
if "resource_already_exists_exception" not in str(e):
raise
log.warning(f"Tried to create an index template that already exists: {alias.upper()}")
log.warning("Tried to create an index template that already exists: %s", alias.upper())

self.with_retries(self.datastore.client.indices.put_alias, index=index, name=alias)
elif not self.with_retries(self.datastore.client.indices.exists, index=index) and \
Expand Down
8 changes: 7 additions & 1 deletion assemblyline/datastore/support/build.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from assemblyline.odm.base import _Field
from assemblyline.odm import Keyword, Text, List, Compound, Date, Integer, Long, \
from assemblyline.odm import Keyword, Wildcard, Text, List, Compound, Date, Integer, Long, \
Float, Boolean, Mapping, Classification, Enum, Any, UUID, Optional, IP, Domain, URI, URIPath, MAC, PhoneNumber, \
SSDeepHash, SHA1, SHA256, MD5, Platform, Processor, ClassificationString, FlattenedObject, Email, UpperKeyword, \
Json, ValidatedKeyword, UNCPath

# Simple types can be resolved by a direct mapping
__type_mapping = {
Keyword: 'keyword',
Wildcard: 'wildcard',
Boolean: 'boolean',
Integer: 'integer',
Long: 'long',
Expand Down Expand Up @@ -111,6 +112,11 @@ def set_mapping(temp_field: _Field, body):
"analyzer": __analyzer_mapping[field.__class__]
})

elif isinstance(field, Wildcard):
es_data_type = __type_mapping[field.__class__]
data = {'type': es_data_type}
mappings[name.strip(".")] = data

elif isinstance(field, Keyword):
es_data_type = __type_mapping[field.__class__]
data = {'type': es_data_type}
Expand Down
7 changes: 6 additions & 1 deletion assemblyline/odm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

# Imports that have the same effect as some part of the one above so that
# type checking can use this file properly.
from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, Date, Long, Enum
from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, \
Date, Long, Enum, Wildcard
from datetime import datetime

_InnerType = typing.TypeVar("_InnerType")
Expand All @@ -27,6 +28,10 @@ def keyword(*args, **kwargs) -> str:
return typing.cast(str, Keyword(*args, **kwargs))


def wildcard(*args, **kwargs) -> str:
    """Construct a Wildcard field, cast to ``str`` so model attributes type-check as strings."""
    return typing.cast(str, Wildcard(*args, **kwargs))


def date(*args, **kwargs) -> datetime:
    """Construct a Date field, cast to ``datetime`` so model attributes type-check as datetimes."""
    return typing.cast(datetime, Date(*args, **kwargs))

Expand Down
61 changes: 30 additions & 31 deletions assemblyline/odm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import sys
import unicodedata
from datetime import datetime
import typing
from typing import Any as _Any
from typing import Dict, Tuple, Union

Expand Down Expand Up @@ -290,6 +291,31 @@ def check(self, value, **kwargs):
return str(value)


class Wildcard(Keyword):
    """
    A keyword with enhanced indexing to support more complex queries.
    """

    def check(self, value, **kwargs):
        # Optional fields accept a missing value outright.
        if self.optional and value is None:
            return None

        # We have a special case for bytes here due to how often strings and bytes
        # get mixed up in python apis
        if isinstance(value, bytes):
            raise ValueError(f"[{self.name or self.parent_name}] Keyword doesn't accept bytes values")

        # Empty or missing values fall back to the default when one is set.
        if value is None or value == '':
            if not self.default_set:
                raise ValueError(f"[{self.name or self.parent_name}] Empty strings are not allowed without defaults")
            value = self.default

        # A default of None is allowed to pass through unchanged.
        return None if value is None else str(value)

class EmptyableKeyword(_Field):
"""
A keyword which allow to differentiate between empty and None values.
Expand Down Expand Up @@ -638,9 +664,9 @@ def check(self, value, **kwargs):


class Integer(_Field):
"""A field storing an integer value."""
"""A field storing a signed 32 bit integer value."""

def __init__(self, max: int = None, min: int = None, *args, **kwargs):
def __init__(self, max: typing.Optional[int] = None, min: typing.Optional[int] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.max = max
self.min = min
Expand Down Expand Up @@ -668,35 +694,8 @@ def check(self, value, **kwargs):
return ret_val


class Long(_Field):
"""A field storing an integer value."""

def __init__(self, max: int = None, min: int = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.max = max
self.min = min

def check(self, value, **kwargs):
if self.optional and value is None:
return None

if value is None or value == "":
if self.default_set:
ret_val = self.default
else:
raise ValueError(f"[{self.name or self.parent_name}] No value provided and no default value set.")
else:
ret_val = int(value)

# Test min/max
if self.max is not None and ret_val > self.max:
raise ValueError(
f"[{self.name or self.parent_name}] Value bigger then the max value. ({value} > {self.max})")
if self.min is not None and ret_val < self.min:
raise ValueError(
f"[{self.name or self.parent_name}] Value smaller then the min value. ({value} < {self.max})")

return ret_val
class Long(Integer):
    """A field storing a signed 64 bit integer value.

    Validation behaviour (min/max bounds, defaults) is inherited unchanged
    from Integer; only the backing index mapping differs ('long' vs 'integer').
    """


class Float(_Field):
Expand Down
4 changes: 2 additions & 2 deletions assemblyline/odm/models/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class File(odm.Model):
name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.")
sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.")
sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.")
size = odm.Integer(store=False, description="The size of the file in bytes.")
size = odm.long(store=False, description="The size of the file in bytes.")
type = odm.Keyword(copyto="__text__", description=" The file type as identified by Assemblyline's analysis.")
screenshots = odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.")

Expand Down Expand Up @@ -202,7 +202,7 @@ class Alert(odm.Model):
filtered = odm.Boolean(default=False, description="Indicates whether portions of the submission's analysis results have been omitted due to the user's classification level not meeting the required threshold for viewing certain data.")
heuristic = odm.Compound(Heuristic, description="Data regarding the heuristics that triggered the alert.")
label = odm.List(odm.Keyword(), copyto="__text__", default=[], description="Labels assigned to the alert for categorization and filtering.")
metadata = odm.FlattenedObject(default={}, store=False, description="Additional metadata provided with the file at the time of submission.")
metadata = odm.Mapping(odm.wildcard(), copyto="__text__", default={}, store=False, description="Additional metadata provided with the file at the time of submission.")
owner = odm.Optional(odm.Keyword(), description="Specifies the user or system component that has taken ownership of the alert. If no user has claimed the alert, it remains under system ownership with no specific user associated, indicated by a value of `None`.")
priority = odm.Optional(odm.Enum(values=PRIORITIES), description="Indicates the importance level assigned to the alert.")
reporting_ts = odm.Date(description="Timestamp when the alert was created.")
Expand Down
2 changes: 1 addition & 1 deletion assemblyline/odm/models/badlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class Hashes(odm.Model):
class File(odm.Model):
name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
description="List of names seen for that file")
size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
size = odm.Optional(odm.long(), description="Size of the file in bytes")
type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")


Expand Down
2 changes: 1 addition & 1 deletion assemblyline/odm/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2014,7 +2014,7 @@ class Submission(odm.Model):
emptyresult_dtl: int = odm.Integer(min=0, description="Number of days emptyresult will remain in the system")
max_dtl: int = odm.Integer(min=0, description="Maximum number of days submissions will remain in the system")
max_extraction_depth: int = odm.Integer(description="Maximum files extraction depth")
max_file_size: int = odm.Integer(description="Maximum size for files submitted in the system")
max_file_size: int = odm.long(description="Maximum size for files submitted in the system")
max_metadata_length: int = odm.Integer(description="Maximum length for each metadata values")
max_temp_data_length: int = odm.Integer(description="Maximum length for each temporary data values")
metadata: MetadataConfig = odm.Compound(MetadataConfig, default=DEFAULT_METADATA_CONFIGURATION,
Expand Down
2 changes: 1 addition & 1 deletion assemblyline/odm/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class File(odm.Model):
seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False)
sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False)
sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.")
size = odm.Integer(description="Size of the file in bytes.")
size = odm.long(description="Size of the file in bytes.")
ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, which is useful for identifying similar files.", ai=False)
type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.")
tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False)
Expand Down
2 changes: 1 addition & 1 deletion assemblyline/odm/models/safelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Hashes(odm.Model):
class File(odm.Model):
name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
description="List of names seen for that file")
size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
size = odm.Optional(odm.long(), description="Size of the file in bytes")
type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")


Expand Down
2 changes: 1 addition & 1 deletion assemblyline/odm/models/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
@odm.model(index=True, store=True)
class Signature(odm.Model):
classification = odm.Classification(store=True, default=Classification.UNRESTRICTED)
data = odm.Text(index=False, store=False)
data = odm.Text(copyto="__text__", store=False)
last_modified = odm.Date(default="NOW")
name = odm.Keyword(copyto="__text__")
order = odm.Integer(default=1, store=False)
Expand Down
6 changes: 3 additions & 3 deletions assemblyline/odm/models/submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@odm.model(index=True, store=False, description="File Model of Submission")
class File(odm.Model):
name = odm.Keyword(copyto="__text__", description="Name of the file")
size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
size = odm.Optional(odm.long(), description="Size of the file in bytes")
sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file")


Expand Down Expand Up @@ -140,9 +140,9 @@ class Submission(odm.Model):
file_count = odm.Integer(description="Total number of files in the submission", ai=False)
files: list[File] = odm.List(odm.Compound(File), description="List of files that were originally submitted")
max_score = odm.Integer(description="Maximum score of all the files in the scan")
metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission")
metadata = odm.Mapping(odm.wildcard(), copyto="__text__", store=False, description="Metadata associated to the submission")
params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False)
results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False)
results: list[str] = odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False)
sid: str = odm.UUID(copyto="__text__", description="Submission ID")
state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False)
to_be_deleted = odm.Boolean(
Expand Down
83 changes: 83 additions & 0 deletions test/test_odm_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from assemblyline import odm
from assemblyline.datastore.collection import ESCollection
from assemblyline.datastore.support.build import build_mapping


@odm.model(index=True)
class OdmTestMapping1(odm.Model):
    # "Before" model of a mapping migration: the swapped_* fields start as
    # keyword/integer and are changed to wildcard/long in OdmTestMapping2,
    # while the stable_* fields keep their types across both models.
    stable_text_field = odm.keyword()
    swapped_text_field = odm.keyword()
    stable_number_field = odm.integer()
    swapped_number_field = odm.integer()


@odm.model(index=True)
class OdmTestMapping2(odm.Model):
    # "After" model of a mapping migration: same field names as
    # OdmTestMapping1, but swapped_text_field is now a wildcard and
    # swapped_number_field is now a long.
    stable_text_field = odm.keyword()
    swapped_text_field = odm.wildcard()
    stable_number_field = odm.integer()
    swapped_number_field = odm.long()


def test_example_mapping_type():
    """Test that the example models produce the expected mapping types"""
    cases = [
        # (model, expected swapped text type, expected swapped number type)
        (OdmTestMapping1, 'keyword', 'integer'),
        (OdmTestMapping2, 'wildcard', 'long'),
    ]

    for model, text_type, number_type in cases:
        properties, dynamic = build_mapping(model.fields().values())

        # There should be no dynamic mappings, just one rule forbidding implicit mappings
        assert len(dynamic) == 1
        assert 'refuse_all_implicit_mappings' in dynamic[0]

        # Check that the static fields have the mapping type we want
        assert len(properties) == 4
        assert properties['stable_text_field']['type'] == 'keyword'
        assert properties['swapped_text_field']['type'] == text_type
        assert properties['stable_number_field']['type'] == 'integer'
        assert properties['swapped_number_field']['type'] == number_type


def test_field_upgrade_ok(datastore_connection):
    """Test that changing a field from keyword to wildcard doesn't break anything."""

    def _check_types(props, text_type, number_type):
        # Verify the elastic type reported for each of the four model fields.
        assert props['stable_text_field']['type'] == 'keyword'
        assert props['swapped_text_field']['type'] == text_type
        assert props['stable_number_field']['type'] == 'integer'
        assert props['swapped_number_field']['type'] == number_type

    # Clean up from any previous runs
    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=False)
    collection.wipe(recreate=False)

    # Create the collection in elastic using the original model
    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=True)
    _check_types(collection.fields(), 'keyword', 'integer')

    # Open that same collection using the new mapping; the existing index
    # should still report the old field types
    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping2, validate=True)
    _check_types(collection.fields(), 'keyword', 'integer')

    # Reindexing rebuilds the index, which applies the new mapping
    collection.reindex()
    _check_types(collection.fields(), 'wildcard', 'long')