Expand mapping types #1894

Open · wants to merge 6 commits into master
26 changes: 13 additions & 13 deletions assemblyline/datastore/collection.py
@@ -8,19 +8,19 @@
 import typing
 import warnings
 
-from copy import deepcopy
-from assemblyline.common.isotime import now_as_iso
-from datemath import dm
-from datemath.helpers import DateMathException
 from datetime import datetime
 from enum import Enum
 from os import environ
-from typing import Dict, Any, Union, TypeVar, Generic
+from typing import Dict, Any, Union, TypeVar, Generic, Optional
+from copy import deepcopy
 
+from datemath import dm
+from datemath.helpers import DateMathException
 import elasticsearch
 import elasticsearch.helpers
 
 from assemblyline import odm
+from assemblyline.common.isotime import now_as_iso
 from assemblyline.common.dict_utils import recursive_update
 from assemblyline.datastore.bulk import ElasticBulkPlan
 from assemblyline.datastore.exceptions import (
@@ -222,8 +222,8 @@ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, ar
         if field.store:
             self.stored_fields[name] = field
 
-    def is_archive_index(self, index):
-        return self.archive_name and index.startswith(self.archive_name)
+    def is_archive_index(self, index) -> bool:
+        return bool(self.archive_name and index.startswith(self.archive_name))
 
     def get_index_list(self, index_type):
         # Default value
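A note on the `is_archive_index` change: Python's `and` evaluates to one of its operands rather than a strict boolean, so the old version could return `None` or an empty string. A minimal illustration (toy values, not project code):

```python
# "a and b" evaluates to one of its operands, not necessarily a bool.
archive_name = None            # e.g. a collection with no archive configured
index = "al_file_archive"

result = archive_name and index.startswith(archive_name)
print(result)   # None -- falsy, but not the bool the new annotation promises

result = bool(archive_name and index.startswith(archive_name))
print(result)   # False
```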
@@ -2004,7 +2004,7 @@ def _get_index_mappings(self) -> dict:
         if self.model_class:
             mappings['properties'], mappings['dynamic_templates'] = \
                 build_mapping(self.model_class.fields().values())
-            mappings['dynamic_templates'].insert(0, default_dynamic_strings)
+            mappings['dynamic_templates'].append(default_dynamic_strings)
         else:
             mappings['dynamic_templates'] = deepcopy(default_dynamic_templates)
 
@@ -2032,17 +2032,17 @@ def __get_possible_fields(self, field):
 
         return field_types
 
-    def _check_fields(self, model=None):
+    def _check_fields(self, target_model: Optional[odm.Model] = None):
         if not self.validate:
             return
 
-        if model is None:
+        if target_model is None:
             if self.model_class:
                 return self._check_fields(self.model_class)
             return
 
         fields = self.fields()
-        model = self.model_class.flat_fields(skip_mappings=True)
+        model = target_model.flat_fields(skip_mappings=True)
 
         missing = set(model.keys()) - set(fields.keys())
         if missing:
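The rename from `model` to `target_model` also removes a shadowing problem: the old body reassigned `model`, so any model a caller passed in was silently ignored in favour of `self.model_class`. A toy reproduction of that pattern (not project code):

```python
class Checker:
    model_class = "model-A"

    def check_old(self, model=None):
        model = self.model_class          # parameter immediately overwritten
        return model

    def check_new(self, target_model=None):
        if target_model is None:
            target_model = self.model_class
        return target_model

c = Checker()
print(c.check_old("model-B"))   # model-A -- caller's argument lost
print(c.check_new("model-B"))   # model-B
```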
@@ -2071,15 +2071,15 @@ def _ensure_collection(self):
             index = f"{alias}_hot"
             # Create HOT index
             if not self.with_retries(self.datastore.client.indices.exists, index=alias):
-                log.debug(f"Index {alias.upper()} does not exists. Creating it now...")
+                log.debug("Index %s does not exist. Creating it now...", alias.upper())
                 try:
                     self.with_retries(self.datastore.client.indices.create, index=index,
                                       mappings=self._get_index_mappings(),
                                       settings=self._get_index_settings(archive=self.is_archive_index(index)))
                 except elasticsearch.exceptions.RequestError as e:
                     if "resource_already_exists_exception" not in str(e):
                         raise
-                    log.warning(f"Tried to create an index template that already exists: {alias.upper()}")
+                    log.warning("Tried to create an index template that already exists: %s", alias.upper())
 
                 self.with_retries(self.datastore.client.indices.put_alias, index=index, name=alias)
             elif not self.with_retries(self.datastore.client.indices.exists, index=index) and \
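Switching the log calls from f-strings to %-style arguments defers message formatting to the logging framework, so the string is only assembled when the record is actually emitted. A small demonstration:

```python
import logging

logging.basicConfig(level=logging.INFO)   # DEBUG records are filtered out
log = logging.getLogger("demo")

alias = "file"
# f-string: the full message is built even though the record is dropped.
log.debug(f"Index {alias.upper()} does not exist. Creating it now...")
# %-style: alias.upper() still runs here, but interpolation into the
# message only happens if a handler actually emits the record.
log.debug("Index %s does not exist. Creating it now...", alias.upper())
```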
16 changes: 13 additions & 3 deletions assemblyline/datastore/support/build.py
@@ -1,12 +1,13 @@
 from assemblyline.odm.base import _Field
-from assemblyline.odm import Keyword, Text, List, Compound, Date, Integer, Long, \
+from assemblyline.odm import Keyword, Wildcard, Text, List, Compound, Date, Integer, Long, \
     Float, Boolean, Mapping, Classification, Enum, Any, UUID, Optional, IP, Domain, URI, URIPath, MAC, PhoneNumber, \
     SSDeepHash, SHA1, SHA256, MD5, Platform, Processor, ClassificationString, FlattenedObject, Email, UpperKeyword, \
     Json, ValidatedKeyword, UNCPath
 
 # Simple types can be resolved by a direct mapping
 __type_mapping = {
     Keyword: 'keyword',
+    Wildcard: 'wildcard',
     Boolean: 'boolean',
     Integer: 'integer',
     Long: 'long',
@@ -111,6 +112,11 @@ def set_mapping(temp_field: _Field, body):
             "analyzer": __analyzer_mapping[field.__class__]
         })
 
+    elif isinstance(field, Wildcard):
+        es_data_type = __type_mapping[field.__class__]
+        data = {'type': es_data_type}
+        mappings[name.strip(".")] = data
+
     elif isinstance(field, Keyword):
         es_data_type = __type_mapping[field.__class__]
         data = {'type': es_data_type}
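Since `Wildcard` subclasses `Keyword` (see the odm/base.py diff below), the new branch has to sit above the `Keyword` branch: `isinstance` dispatch is order-sensitive. A toy illustration:

```python
# The most specific class must be tested first, or the value falls
# into the parent's branch.
class Keyword:
    pass

class Wildcard(Keyword):
    pass

def resolve(field):
    if isinstance(field, Wildcard):    # checked first: a Wildcard is a Keyword too
        return "wildcard"
    elif isinstance(field, Keyword):
        return "keyword"

print(resolve(Wildcard()))   # wildcard
print(resolve(Keyword()))    # keyword
```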
@@ -198,14 +204,18 @@ def build_templates(name, field, nested_template=False, index=True) -> list:
 
         return [{f"nested_{name}": main_template}]
     else:
+        mapping = __type_mapping[field.__class__]
         field_template = {
             "path_match": name,
             "mapping": {
-                "type": __type_mapping[field.__class__],
+                "type": mapping,
             }
         }
 
-        field_template['mapping']['index'] = field.index
+        # Wildcard doesn't support setting index, it's _always_ indexed
+        if mapping != 'wildcard':
+            field_template['mapping']['index'] = field.index
 
         if field.copyto:
             assert len(field.copyto) == 1
             field_template['mapping']['copy_to'] = field.copyto[0]
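Elasticsearch's `wildcard` field type does not accept the `index` mapping parameter (such fields are always indexed), so the template only emits the flag for other types. A sketch of the two template shapes this branch can produce (the field paths are hypothetical):

```python
# Dynamic template for a wildcard-typed field: no "index" key allowed.
wildcard_template = {
    "path_match": "metadata.*",
    "mapping": {"type": "wildcard"},
}
# Any other type keeps the explicit index flag.
keyword_template = {
    "path_match": "labels.*",
    "mapping": {"type": "keyword", "index": True},
}
```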
7 changes: 6 additions & 1 deletion assemblyline/odm/__init__.py
@@ -5,7 +5,8 @@
 
 # Imports that have the same effect as some part of the one above so that
 # type checking can use this file properly.
-from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, Date, Long, Enum
+from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, \
+    Date, Long, Enum, Wildcard
 from datetime import datetime
 
 _InnerType = typing.TypeVar("_InnerType")
@@ -27,6 +28,10 @@ def keyword(*args, **kwargs) -> str:
     return typing.cast(str, Keyword(*args, **kwargs))
 
 
+def wildcard(*args, **kwargs) -> str:
+    return typing.cast(str, Wildcard(*args, **kwargs))
+
+
 def date(*args, **kwargs) -> datetime:
     return typing.cast(datetime, Date(*args, **kwargs))
 
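The new `wildcard()` helper mirrors `keyword()`: at runtime it returns a `Wildcard` field instance, while `typing.cast(str, ...)` lets type checkers treat the attribute as a plain string. Hypothetical usage on a model (the model name and data are illustrative, assuming models are built from a dict like elsewhere in the codebase):

```python
from assemblyline import odm

@odm.model(index=True)
class Example(odm.Model):               # illustrative model, not from the PR
    address = odm.wildcard(copyto='__text__')

doc = Example({"address": "https://cyber.gc.ca"})
print(doc.address)                      # type checkers see this as str
```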
61 changes: 30 additions & 31 deletions assemblyline/odm/base.py
@@ -19,6 +19,7 @@
 import sys
 import unicodedata
 from datetime import datetime
+import typing
 from typing import Any as _Any
 from typing import Dict, Tuple, Union
 
@@ -290,6 +291,31 @@ def check(self, value, **kwargs):
         return str(value)
 
 
+class Wildcard(Keyword):
+    """
+    A keyword with enhanced indexing to support more complex queries.
+    """
+
+    def check(self, value, **kwargs):
+        if self.optional and value is None:
+            return None
+
+        # We have a special case for bytes here due to how often strings and bytes
+        # get mixed up in python apis
+        if isinstance(value, bytes):
+            raise ValueError(f"[{self.name or self.parent_name}] Keyword doesn't accept bytes values")
+
+        if value == '' or value is None:
+            if self.default_set:
+                value = self.default
+            else:
+                raise ValueError(f"[{self.name or self.parent_name}] Empty strings are not allowed without defaults")
+
+        if value is None:
+            return None
+
+        return str(value)
+
 class EmptyableKeyword(_Field):
     """
     A keyword which allow to differentiate between empty and None values.


class Integer(_Field):
"""A field storing an integer value."""
"""A field storing a signed 32 bit integer value."""

def __init__(self, max: int = None, min: int = None, *args, **kwargs):
def __init__(self, max: typing.Optional[int] = None, min: typing.Optional[int] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.max = max
self.min = min
@@ -668,35 +694,8 @@ def check(self, value, **kwargs):
         return ret_val
 
 
-class Long(_Field):
-    """A field storing an integer value."""
-
-    def __init__(self, max: int = None, min: int = None, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.max = max
-        self.min = min
-
-    def check(self, value, **kwargs):
-        if self.optional and value is None:
-            return None
-
-        if value is None or value == "":
-            if self.default_set:
-                ret_val = self.default
-            else:
-                raise ValueError(f"[{self.name or self.parent_name}] No value provided and no default value set.")
-        else:
-            ret_val = int(value)
-
-        # Test min/max
-        if self.max is not None and ret_val > self.max:
-            raise ValueError(
-                f"[{self.name or self.parent_name}] Value bigger then the max value. ({value} > {self.max})")
-        if self.min is not None and ret_val < self.min:
-            raise ValueError(
-                f"[{self.name or self.parent_name}] Value smaller then the min value. ({value} < {self.max})")
-
-        return ret_val
+class Long(Integer):
+    """A field storing a signed 64 bit integer value."""
 
 
 class Float(_Field):
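Collapsing `Long` into a subclass drops nearly thirty lines of duplicated validation (including the old copy's minimum-check message, which printed `self.max` instead of `self.min`). The updated docstrings refer to the Elasticsearch `integer` and `long` types, signed 32-bit and 64-bit respectively; the ODM itself only enforces bounds when `min`/`max` are passed:

```python
# Elasticsearch-side bounds the renamed docstrings refer to:
INTEGER_MAX = 2**31 - 1     # 2_147_483_647
LONG_MAX = 2**63 - 1        # 9_223_372_036_854_775_807
print(INTEGER_MAX, LONG_MAX)
```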
2 changes: 1 addition & 1 deletion assemblyline/odm/models/config.py
@@ -2014,7 +2014,7 @@ class Submission(odm.Model):
     emptyresult_dtl: int = odm.Integer(min=0, description="Number of days emptyresult will remain in the system")
     max_dtl: int = odm.Integer(min=0, description="Maximum number of days submissions will remain in the system")
     max_extraction_depth: int = odm.Integer(description="Maximum files extraction depth")
-    max_file_size: int = odm.Integer(description="Maximum size for files submitted in the system")
+    max_file_size: int = odm.long(description="Maximum size for files submitted in the system")
     max_metadata_length: int = odm.Integer(description="Maximum length for each metadata values")
     max_temp_data_length: int = odm.Integer(description="Maximum length for each temporary data values")
     metadata: MetadataConfig = odm.Compound(MetadataConfig, default=DEFAULT_METADATA_CONFIGURATION,
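With an `integer` mapping, `max_file_size` caps out just under 2 GiB; moving it to `odm.long` makes larger submission limits expressible. The arithmetic:

```python
int_max = 2**31 - 1
print(int_max / 2**30)        # ~2.0 -- a 32-bit integer tops out at about 2 GiB
print((2**63 - 1) / 2**40)    # a long allows sizes in the millions of TiB
```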
125 changes: 125 additions & 0 deletions test/test_odm_mapping.py
@@ -0,0 +1,125 @@
+from assemblyline import odm
+from assemblyline.datastore.collection import ESCollection
+from assemblyline.datastore.support.build import build_mapping
+
+
+@odm.model(index=True)
+class OdmTestMapping1(odm.Model):
+    stable_text_field = odm.keyword()
+    swapped_text_field = odm.keyword()
+    stable_number_field = odm.integer()
+    swapped_number_field = odm.integer()
+
+
+@odm.model(index=True)
+class OdmTestMapping2(odm.Model):
+    stable_text_field = odm.keyword()
+    swapped_text_field = odm.wildcard()
+    stable_number_field = odm.integer()
+    swapped_number_field = odm.long()
+
+
+def test_example_mapping_type():
+    """Test that the example models produce the expected mapping types"""
+    properties, dynamic = build_mapping(OdmTestMapping1.fields().values())
+
+    # There should be no dynamic mappings, just one rule forbidding implicit mappings
+    assert len(dynamic) == 1
+    assert 'refuse_all_implicit_mappings' in dynamic[0]
+
+    # Check that the static fields have the mapping type we want
+    assert len(properties) == 4
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    properties, dynamic = build_mapping(OdmTestMapping2.fields().values())
+
+    # There should be no dynamic mappings, just one rule forbidding implicit mappings
+    assert len(dynamic) == 1
+    assert 'refuse_all_implicit_mappings' in dynamic[0]
+
+    # Check that the static fields have the mapping type we want
+    assert len(properties) == 4
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'wildcard'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'long'
+
+
+def test_field_upgrade_ok(datastore_connection):
+    """Test that changing a field from keyword to wildcard doesn't break anything."""
+    # Clean up from any previous runs
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=False)
+    collection.wipe(recreate=False)
+
+    # Create the collection in elastic
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=True)
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    # Open that same collection using the new mapping
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping2, validate=True)
+
+    # Check that the fields haven't changed
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    # Reindex
+    collection.reindex()
+
+    # Check that the fields match the new model
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'wildcard'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'long'
+
+
+def test_metadata_indexing(datastore_connection):
+
+    @odm.model(index=True)
+    class TestMapping(odm.Model):
+        metadata = odm.Mapping(odm.wildcard(copyto='__text__'))
+
+    # Clean up from any previous runs
+    collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=False)
+    collection.wipe(recreate=False)
+
+    print(build_mapping(TestMapping.fields().values()))
+
+    # Create with new mapping configuration
+    collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=True)
+
+    # Insert data to trigger dynamic field creation
+    collection.save("1", {"metadata": {'field1': 123}})
+    collection.save("2", {"metadata": {'field2': "123"}})
+    collection.save("3", {"metadata": {'field3': {'subfield': "cat dog cat"}}})
+    collection.save("4", {"metadata": {'address': "https://cyber.gc.ca"}})
+    collection.commit()
+
+    # Check if those fields are the type and config we want
+    fields = collection.fields()
+    fields.pop('id')
+
+    assert len(fields) == 4
+    for field_name, field in fields.items():
+        assert field['type'] == 'wildcard', (field_name, field)
+        assert field['indexed']
+        assert field['default'], (field_name, field)
+
+    # Check that copyto and regex work
+    search = collection.search("cyber.gc.ca")
+    assert search['total'] == 1
+    assert search['items'][0].id == "4"
+
+    search = collection.search("address: /http[s]://cyber\\.(gc\\.ca|com)/")
+    assert search['total'] == 1
+    assert search['items'][0].id == "4"
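A note on the final assertion: Python's `\\.` escapes produce a literal backslash-dot in the query string, which Lucene's regexp syntax reads as an escaped dot. An equivalent check in plain Python, using the value stored in document "4":

```python
import re

query = "address: /http[s]://cyber\\.(gc\\.ca|com)/"
print(query)   # address: /http[s]://cyber\.(gc\.ca|com)/

# The same pattern, applied directly to the stored value:
assert re.fullmatch(r"http[s]://cyber\.(gc\.ca|com)", "https://cyber.gc.ca")
```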