From e752edf1266dcb310cfcaf20821e1e84623a8963 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Mon, 17 Feb 2025 17:36:01 +0000 Subject: [PATCH] two different bugfixes squished together --- assemblyline/datastore/support/build.py | 62 +++++--- assemblyline/odm/base.py | 13 ++ test/test_odm.py | 2 + test/test_odm_build_mapping.py | 190 ++++++++++++++++++++++++ 4 files changed, 247 insertions(+), 20 deletions(-) create mode 100644 test/test_odm_build_mapping.py diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py index 4f713100b..c9a9cc6ba 100644 --- a/assemblyline/datastore/support/build.py +++ b/assemblyline/datastore/support/build.py @@ -54,7 +54,7 @@ back_mapping.update({x: Keyword for x in set(__analyzer_mapping.values())}) -def build_mapping(field_data, prefix=None, allow_refuse_implicit=True): +def build_mapping(field_data, prefix=None, allow_refuse_implicit=True, default_copyto=None): """ The mapping for Elasticsearch based on a python model object. 
""" @@ -72,6 +72,8 @@ def set_mapping(temp_field: _Field, body): if temp_field.copyto: assert len(temp_field.copyto) == 1 body['copy_to'] = temp_field.copyto[0] + elif default_copyto is not None and default_copyto: + body['copy_to'] = default_copyto[0] return body @@ -127,13 +129,14 @@ def set_mapping(temp_field: _Field, body): }) elif isinstance(field, FlattenedObject): - if not field.index or isinstance(field.child_type, Any): + if not any_indexed_part(field) or isinstance(field.child_type, Any): mappings[name.strip(".")] = {"type": "object", "enabled": False} else: dynamic.extend(build_templates(f'{name}.*', field.child_type, nested_template=True, index=field.index)) elif isinstance(field, List): temp_mappings, temp_dynamic = build_mapping([field.child_type], prefix=path, + default_copyto=field.copyto, allow_refuse_implicit=False) mappings.update(temp_mappings) dynamic.extend(temp_dynamic) @@ -151,7 +154,7 @@ def set_mapping(temp_field: _Field, body): dynamic.extend(temp_dynamic) elif isinstance(field, Mapping): - if not field.index or isinstance(field.child_type, Any): + if not any_indexed_part(field) or isinstance(field.child_type, Any): mappings[name.strip(".")] = {"type": "object", "enabled": False} else: dynamic.extend(build_templates(f'{name}.*', field.child_type, index=field.index)) @@ -186,6 +189,25 @@ def set_mapping(temp_field: _Field, body): return mappings, dynamic +def any_indexed_part(field) -> bool: + """Figure out if any component of this field is indexed.""" + if isinstance(field, (FlattenedObject, List, Optional, Mapping)): + if field.index is not None: + return field.index + return any_indexed_part(field.child_type) + + elif isinstance(field, Compound): + if field.index is not None: + return field.index + for subfield in field.fields().values(): + if any_indexed_part(subfield): + return True + return False + + else: + return field.index + + def build_templates(name, field, nested_template=False, index=True) -> list: if isinstance(field, 
(Keyword, Boolean, Integer, Float, Text, Json)): if nested_template: @@ -212,29 +234,16 @@ def build_templates(name, field, nested_template=False, index=True) -> list: return [{f"{name}_tpl": field_template}] - elif isinstance(field, Any) or not index: - field_template = { - "path_match": name, - "mapping": { - "type": "keyword", - "index": False - } - } - - if field.index: - raise ValueError(f"Mapping to Any may not be indexed: {name}") - return [{f"{name}_tpl": field_template}] - elif isinstance(field, (Mapping, List)): temp_name = name - if field.name: - temp_name = f"{name}.{field.name}" + # if field.name: + # temp_name = f"{name}.{field.name}" return build_templates(temp_name, field.child_type, nested_template=True) elif isinstance(field, Compound): temp_name = name - if field.name: - temp_name = f"{name}.{field.name}" + # if field.name: + # temp_name = f"{name}.{field.name}" out = [] for sub_name, sub_field in field.fields().items(): @@ -246,5 +255,18 @@ def build_templates(name, field, nested_template=False, index=True) -> list: elif isinstance(field, Optional): return build_templates(name, field.child_type, nested_template=nested_template) + elif isinstance(field, Any) or not index: + field_template = { + "path_match": name, + "mapping": { + "type": "keyword", + "index": False + } + } + + if field.index: + raise ValueError(f"Mapping to Any may not be indexed: {name}") + return [{f"{name}_tpl": field_template}] + else: raise NotImplementedError(f"Unknown type for elasticsearch dynamic mapping: {field.__class__}") diff --git a/assemblyline/odm/base.py b/assemblyline/odm/base.py index 6a3ff54ff..af374455b 100644 --- a/assemblyline/odm/base.py +++ b/assemblyline/odm/base.py @@ -200,6 +200,16 @@ def apply_defaults(self, index, store): if self.store is None: self.store = store + def inherit_parameters(self, other): + if self.index is None: + self.index = other.index + if self.store is None: + self.store = other.store + if self.ai is None: + self.ai = other.ai + 
if len(self.copyto) == 0: + self.copyto = other.copyto + def fields(self): """ Return the subfields/modified field data. @@ -863,6 +873,7 @@ def __init__(self, child_type, auto=False, **kwargs): super().__init__(**kwargs) self.child_type = child_type self.auto = auto + self.inherit_parameters(self.child_type) def check(self, value, **kwargs): if self.optional and value is None: @@ -957,6 +968,7 @@ def __init__(self, child_type, **kwargs): super().__init__(**kwargs) self.child_type = child_type + self.inherit_parameters(child_type) def check(self, value, **kwargs): if self.optional and value is None: @@ -1056,6 +1068,7 @@ def __init__(self, child_type, **kwargs): self.default_set = True child_type.optional = True self.child_type = child_type + self.inherit_parameters(child_type) def check(self, value, *args, **kwargs): if value is None: diff --git a/test/test_odm.py b/test/test_odm.py index 3ba0ca4ba..e9b861c23 100644 --- a/test/test_odm.py +++ b/test/test_odm.py @@ -899,3 +899,5 @@ class Outer(Model): assert {k: v.multivalued for k, v in Outer.flat_fields(show_compound=True, skip_mappings=True).items()} == { "c": True, "c.a": False, "c.b": False} + + diff --git a/test/test_odm_build_mapping.py b/test/test_odm_build_mapping.py new file mode 100644 index 000000000..6e37b0091 --- /dev/null +++ b/test/test_odm_build_mapping.py @@ -0,0 +1,190 @@ +from assemblyline import odm +from assemblyline.datastore.support.build import build_mapping + + +def test_simple_fields(): + @odm.model() + class Inner(odm.Model): + text = odm.Text() + key = odm.keyword() + index_key = odm.keyword(index=True) + no_index_key = odm.keyword(index=False) + + @odm.model(index=True) + class NestAlwaysIndex(odm.Model): + a = odm.compound(Inner) + b = odm.compound(Inner, index=True) + c = odm.compound(Inner, index=False) + + @odm.model() + class Outer(odm.Model): + a = odm.compound(Inner) + b = odm.compound(Inner, index=True) + c = odm.compound(Inner, index=False) + d = odm.compound(NestAlwaysIndex) 
+ e = odm.compound(NestAlwaysIndex, index=False) + + # Build the mappings + static, dynamic = build_mapping(Outer.fields().values()) + + # Check the static fields + FIELDS = { + 'a.': [ + ("text", "text", None), + ("key", "keyword", None), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'b.': [ + ("text", "text", True), + ("key", "keyword", True), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'c.': [ + ("text", "text", False), + ("key", "keyword", False), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'd.a.': [ + ("text", "text", True), + ("key", "keyword", True), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'd.b.': [ + ("text", "text", True), + ("key", "keyword", True), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'd.c.': [ + ("text", "text", False), + ("key", "keyword", False), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'e.a.': [ + ("text", "text", True), + ("key", "keyword", True), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'e.b.': [ + ("text", "text", True), + ("key", "keyword", True), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + 'e.c.': [ + ("text", "text", False), + ("key", "keyword", False), + ("index_key", "keyword", True), + ("no_index_key", "keyword", False), + ], + } + + for prefix, fields in FIELDS.items(): + for name, type_, indexed in fields: + field = static.pop(prefix + name) + assert field['type'] == type_ + assert field['index'] is indexed, prefix + name + assert len(static) == 0 + + # Make sure there are no dynamic fields + assert len(dynamic) == 1 + assert list(dynamic[0].keys()) == ['refuse_all_implicit_mappings'] + + +def test_dynamic_fields(): + @odm.model() + class Inner(odm.Model): + text = odm.Text() + key = odm.keyword() + index_key = odm.keyword(index=True) + no_index_key 
= odm.keyword(index=False) + + @odm.model(index=True) + class NestAlwaysIndex(odm.Model): + a = odm.compound(Inner) + b = odm.compound(Inner, index=True) + c = odm.compound(Inner, index=False) + + # Only mappings where the mapping itself is marked for indexing will have its subfields indexed + @odm.model(index=True) + class Outer(odm.Model): + a = odm.mapping(odm.compound(Inner), index=False) + b = odm.mapping(odm.compound(Inner, index=True)) + c = odm.mapping(odm.compound(Inner, index=False)) + d = odm.mapping(odm.compound(NestAlwaysIndex), index=False) + e = odm.mapping(odm.compound(NestAlwaysIndex, index=False)) + f = odm.mapping(odm.integer()) + + # Build the mappings + static, dynamic = build_mapping(Outer.fields().values()) + + # make sure the static lines corresponding to mappings are disabled + for name in ['a', 'c', 'd', 'e']: + field = static.pop(name) + assert field['enabled'] is False + assert field['type'] == 'object' + assert len(static) == 0, static + + # Make sure there are dynamic rules for the expected fields + rules = {} + for row in dynamic: + for key, config in row.items(): + assert key not in rules + rules[key] = config + assert rules == { + 'b.*.text_tpl': {'mapping': {'index': True, 'type': 'text'}, 'path_match': 'b.*.text'}, + 'b.*.key_tpl': {'mapping': {'index': True, 'type': 'keyword'}, 'path_match': 'b.*.key'}, + 'b.*.index_key_tpl': {'mapping': {'index': True, 'type': 'keyword'}, 'path_match': 'b.*.index_key'}, + 'b.*.no_index_key_tpl': {'mapping': {'index': False, 'type': 'keyword'}, 'path_match': 'b.*.no_index_key'}, + 'f.*_tpl': {'mapping': {'index': True, 'type': 'integer'}, 'path_match': 'f.*'}, + } + + +def test_dynamic_fields_simple(): + """Simplified version of the above test for checking some more common cases""" + @odm.model() + class InnerNone(odm.Model): + a = odm.keyword() + b = odm.keyword(index=True) + + @odm.model(index=True) + class InnerTrue(odm.Model): + a = odm.keyword() + b = odm.keyword(index=True) + + @odm.model() 
+ class Outer(odm.Model): + a = odm.mapping(odm.compound(InnerNone), index=True) + b = odm.mapping(odm.compound(InnerNone, index=True)) + c = odm.mapping(odm.compound(InnerNone)) + d = odm.mapping(odm.compound(InnerTrue)) + + # Build the mappings + static, dynamic = build_mapping(Outer.fields().values()) + + # There shouldn't be any static mappings + assert len(static) == 0 + + # Make sure there are dynamic rules for the expected fields + rules = {} + for row in dynamic: + for key, config in row.items(): + assert key not in rules + rules[key] = config + assert rules == { + 'a.*.a_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'a.*.a'}, + 'a.*.b_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'a.*.b'}, + 'b.*.a_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'b.*.a'}, + 'b.*.b_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'b.*.b'}, + # All the same except this one + 'c.*.a_tpl': {'mapping': {'type': 'keyword', 'index': None}, 'path_match': 'c.*.a'}, + 'c.*.b_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'c.*.b'}, + 'd.*.a_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'd.*.a'}, + 'd.*.b_tpl': {'mapping': {'type': 'keyword', 'index': True}, 'path_match': 'd.*.b'}, + }