Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate outputs for each individual subset with custom options #873

Merged
merged 13 commits into from
Jul 13, 2020
Merged
28 changes: 18 additions & 10 deletions scripts/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import glob
import os
import yaml
import time

from generators import asciidoc_fields
from generators import beats
Expand All @@ -22,15 +23,6 @@ def main():
ecs_version = read_version(args.ref)
print('Running generator. ECS version ' + ecs_version)

# To debug issues in the gradual building up of the nested structure, insert
# statements like this after any step of interest.
# ecs_helpers.yaml_dump('ecs.yml', fields)

fields = loader.load_schemas(ref=args.ref, included_files=args.include)
cleaner.clean(fields)
finalizer.finalize(fields)
fields = subset_filter.filter(fields, args.subset)

# default location to save files
out_dir = 'generated'
docs_dir = 'docs'
Expand All @@ -44,7 +36,23 @@ def main():
ecs_helpers.make_dirs(out_dir)
ecs_helpers.make_dirs(docs_dir)

nested, flat = intermediate_files.generate(fields, out_dir, default_dirs)
# To debug issues in the gradual building up of the nested structure, insert
# statements like this after any step of interest.
# ecs_helpers.yaml_dump('ecs.yml', fields)

fields = loader.load_schemas(ref=args.ref, included_files=args.include)
cleaner.clean(fields)
finalizer.finalize(fields)
subsets = subset_filter.load_subset_definitions(args.subset)
for subset in subsets:
subfields = subset_filter.extract_matching_fields(fields, subset['fields'])
intermediate_files.generate(subfields, os.path.join(out_dir, 'ecs', 'subset', subset['name']), default_dirs)

merged_subset = subset_filter.combine_all_subsets(subsets)
if merged_subset:
fields = subset_filter.extract_matching_fields(fields, merged_subset)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep all subset-related functionality like this loop inside subset_filter.py.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll refactor this


nested, flat = intermediate_files.generate(fields, os.path.join(out_dir, 'ecs'), default_dirs)
if args.intermediate_only:
exit()

Expand Down
9 changes: 4 additions & 5 deletions scripts/generators/intermediate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@


def generate(fields, out_dir, default_dirs):
ecs_helpers.make_dirs(join(out_dir, 'ecs'))
ecs_helpers.make_dirs(join(out_dir))

# Should only be used for debugging ECS development
if default_dirs:
ecs_helpers.yaml_dump(join(out_dir, 'ecs/ecs.yml'), fields)

ecs_helpers.yaml_dump(join(out_dir, 'ecs.yml'), fields)
flat = generate_flat_fields(fields)
nested = generate_nested_fields(fields)

ecs_helpers.yaml_dump(join(out_dir, 'ecs/ecs_flat.yml'), flat)
ecs_helpers.yaml_dump(join(out_dir, 'ecs/ecs_nested.yml'), nested)
ecs_helpers.yaml_dump(join(out_dir, 'ecs_flat.yml'), flat)
ecs_helpers.yaml_dump(join(out_dir, 'ecs_nested.yml'), nested)
return nested, flat


Expand Down
2 changes: 2 additions & 0 deletions scripts/schema/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ def merge_fields(a, b):
asd['reusable']['top_level'] = bsd['reusable']['top_level']
else:
asd['reusable'].setdefault('top_level', True)
if 'order' in bsd['reusable']:
asd['reusable']['order'] = bsd['reusable']['order']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do the Endpoint customizations have chained reuses, like group => user => other places?

In any case 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we reuse process at Target.process, and since hash is reused in process, we need a way to either set the order for hash to 1 or the order for process to 3.

asd['reusable'].setdefault('expected', [])
asd['reusable']['expected'].extend(bsd['reusable']['expected'])
bsd.pop('reusable')
Expand Down
84 changes: 55 additions & 29 deletions scripts/schema/subset_filter.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
import glob
import yaml
import copy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also here - did this import end up unneeded?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, I'm not seeing any usage of the copy library here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, missed this. Will remove.


# This script takes all ECS and custom fields already loaded, and lets users
# filter out the ones they don't need.


def filter(fields, subset_file_globs):
'''
Takes the deeply nested field structure and the subset file names.

It returns a copy of the fields that matches the whitelist defined in the subset.
'''
if not subset_file_globs or subset_file_globs == []:
return fields
subset_definitions = load_subset_definitions(subset_file_globs)
filtered_fields = extract_matching_fields(fields, subset_definitions)
return filtered_fields
def combine_all_subsets(subsets):
    '''
    Merges N subset definitions into a single subset.

    Only the content under each definition's 'fields' key takes part in the
    merge, so top level keys such as 'name' do not appear in the result.
    Non-ECS field options are stripped first, since there is no way to know
    how those should be merged. The stripping is done in place on the
    definitions that were passed in.
    '''
    combined = {}
    for definition in subsets:
        field_rules = definition['fields']
        strip_non_ecs_options(field_rules)
        merge_subsets(combined, field_rules)
    return combined


def load_subset_definitions(file_globs):
subsets = {}
if not file_globs:
return []
subsets = []
for f in eval_globs(file_globs):
raw = load_yaml_file(f)
merge_subsets(subsets, raw)
subsets.append(raw)
if not subsets:
raise ValueError('--subset specified, but no subsets found in {}'.format(file_globs))
return subsets
Expand Down Expand Up @@ -50,29 +49,56 @@ def warn(message):
print(message)


ecs_options = ['fields', 'enabled', 'index']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So would the idea here be that we could set enabled and index per subset if we wanted, instead of having to do it globally in the custom schema files? For example, as we are doing here: https://github.com/elastic/endpoint-package/blob/master/custom_schemas/custom_endpoint.yml#L28

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, and when merging subsets together the field is enabled in the index if it is enabled in any of the subsets. This also lets us easily disable indexing on ECS fields.



def strip_non_ecs_options(subset):
    '''
    Drops every option that is not listed in ecs_options, at every nesting level.

    The subset is modified in place; nothing is returned.
    '''
    for name, options in subset.items():
        kept = {opt: value for opt, value in options.items() if opt in ecs_options}
        subset[name] = kept
        nested = kept.get('fields')
        # Only a dict of sub-fields is recursed into; any other 'fields' value is left as is
        if isinstance(nested, dict):
            strip_non_ecs_options(nested)


def merge_subsets(a, b):
'''Merges field subset definitions together. The b subset is merged into the a subset.'''
'''Merges field subset definitions together. The b subset is merged into the a subset. Assumes that subsets have been stripped of non-ecs options.'''
for key in b:
if key not in a:
a[key] = b[key]
elif 'fields' not in a[key] or 'fields' not in b[key] or b[key]['fields'] == '*':
a[key]['fields'] = '*'
elif isinstance(a[key]['fields'], dict) and isinstance(b[key]['fields'], dict):
merge_subsets(a[key]['fields'], b[key]['fields'])
elif 'fields' in a[key] and 'fields' in b[key]:
if b[key]['fields'] == '*':
a[key]['fields'] = '*'
elif isinstance(a[key]['fields'], dict) and isinstance(b[key]['fields'], dict):
merge_subsets(a[key]['fields'], b[key]['fields'])
elif 'fields' in a[key] or 'fields' in b[key]:
raise ValueError("Subsets unmergeable: 'fields' found in key '{}' in only one subset".format(key))
# If both subsets have enabled set to False, this will leave enabled: False in the merged subset
# Otherwise, enabled is removed and is implicitly true
if a[key].get('enabled', True) or b[key].get('enabled', True):
a[key].pop('enabled', None)
# Same logic from 'enabled' applies to 'index'
if a[key].get('index', True) or b[key].get('index', True):
a[key].pop('index', None)


def extract_matching_fields(fields, subset_definitions):
retained_fields = {}
allowed_options = ['fields']
'''Removes fields that are not in the subset definition. Returns a copy without modifying the input fields dict.'''
retained_fields = {x: fields[x].copy() for x in subset_definitions}
for key, val in subset_definitions.items():
if 'field_details' in fields[key]:
retained_fields[key]['field_details'] = fields[key]['field_details'].copy()
for option in val:
if option not in allowed_options:
raise ValueError('Unsupported option found in subset: {}'.format(option))
# A missing fields key is shorthand for including all subfields
if 'fields' not in val or val['fields'] == '*':
retained_fields[key] = fields[key]
elif isinstance(val['fields'], dict):
# Copy the full field over so we get all the options, then replace the 'fields' with the right subset
retained_fields[key] = fields[key]
retained_fields[key]['fields'] = extract_matching_fields(fields[key]['fields'], val['fields'])
if option != 'fields':
retained_fields[key].setdefault('field_details', {})
retained_fields[key]['field_details'][option] = val[option]
# If the field in the schema has a 'fields' key, we expect a 'fields' key in the subset
if 'fields' in fields[key]:
if 'fields' not in val:
raise ValueError("'fields' key expected, not found in subset for {}".format(key))
elif isinstance(val['fields'], dict):
retained_fields[key]['fields'] = extract_matching_fields(fields[key]['fields'], val['fields'])
elif val['fields'] != "*":
raise ValueError("Unexpected value '{}' found in 'fields' key".format(val['fields']))
# If the field in the schema does not have a 'fields' key, there should not be a 'fields' key in the subset
elif 'fields' in val:
raise ValueError("'fields' key not expected, found in subset for {}".format(key))
return retained_fields
78 changes: 70 additions & 8 deletions scripts/tests/unit/test_schema_subset_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ def test_basic_merging(self):

def test_merging_superset(self):
# 'log' is used to test superset with the explicit '{'fields': '*'}' notation
# 'process' is used to test superset with the shorthand '{}' notation
supersets = {'log': {'fields': '*'}, 'process': {}}
supersets = {'log': {'fields': '*'}, 'process': {'fields': '*'}}
supserseded = {
'log': {'fields': {'syslog': {'fields': '*'}}},
'process': {'fields': {'parent': {'fields': '*'}}},
Expand All @@ -55,6 +54,50 @@ def test_merging_superset(self):
subset_filter.merge_subsets(subsets, supersets)
self.assertEqual(subsets, supersets)

def test_subset_option_merging(self):
    # 'enabled' and 'index' should only survive a merge when every subset sets them to False.
    first = {
        'log': {'enabled': False},
        'network': {'enabled': False, 'fields': '*'},
        'base': {'fields': {'message': {'index': False}}},
    }
    second = {
        'log': {'enabled': False},
        'network': {'fields': '*'},
        'base': {'fields': {'message': {}}},
    }
    combined = {}
    for definition in (first, second):
        subset_filter.merge_subsets(combined, definition)
    self.assertEqual(
        combined,
        {
            'log': {'enabled': False},
            'network': {'fields': '*'},
            'base': {'fields': {'message': {}}},
        },
    )

def test_strip_non_ecs_options(self):
    # Custom options must be removed at the top level and inside nested 'fields'.
    definition = {
        'log': {
            'custom_option': True,
            'enabled': False,
            'fields': {'syslog': {'custom_option': True}},
        }
    }
    subset_filter.strip_non_ecs_options(definition)
    stripped = {
        'log': {
            'enabled': False,
            'fields': {'syslog': {}},
        }
    }
    self.assertEqual(definition, stripped)

def schema_log(self):
return {
'log': {
Expand Down Expand Up @@ -91,18 +134,13 @@ def schema_log(self):
}
}

def test_extract_matching_fields_shorthand_notation(self):
subset = {'log': {}}
filtered_fields = subset_filter.extract_matching_fields(self.schema_log(), subset)
self.assertEqual(filtered_fields, self.schema_log())

def test_extract_matching_fields_explicit_all_fields_notation(self):
    # An explicit '*' for 'fields' should hand back the schema unchanged.
    wildcard_subset = {'log': {'fields': '*'}}
    result = subset_filter.extract_matching_fields(self.schema_log(), wildcard_subset)
    self.assertEqual(result, self.schema_log())

def test_extract_matching_fields_subfields_only_notation(self):
subset = {'log': {'fields': {'origin': {}}}}
subset = {'log': {'fields': {'origin': {'fields': '*'}}}}
filtered_fields = subset_filter.extract_matching_fields(self.schema_log(), subset)
expected_fields = {
'log': {
Expand Down Expand Up @@ -158,3 +196,27 @@ def test_extract_matching_individual_field(self):
}
}
self.assertEqual(filtered_fields, expected_fields)

def test_extract_field_with_options(self):
    # Options given in the subset are expected to show up in each field's 'field_details'.
    subset_with_options = {
        'log': {
            'enabled': False,
            'fields': {'level': {'custom_option': True}},
        }
    }
    actual = subset_filter.extract_matching_fields(self.schema_log(), subset_with_options)
    log_details = {'name': 'log', 'type': 'group', 'enabled': False}
    level_details = {'name': 'level', 'type': 'keyword', 'custom_option': True}
    wanted = {
        'log': {
            'schema_details': {'root': False},
            'field_details': log_details,
            'fields': {'level': {'field_details': level_details}},
        }
    }
    self.assertEqual(actual, wanted)