From e5cd3a80be69e925fcb2ebddd0e9ec16cdac7d57 Mon Sep 17 00:00:00 2001 From: migueldo Date: Thu, 7 Jul 2022 18:36:36 +0200 Subject: [PATCH 1/2] emlParser with added functionnalities for PR --- analyzers/EmlParser/EmlParser.json | 10 +- analyzers/EmlParser/parse.py | 152 ++++++++++++++++++----------- 2 files changed, 102 insertions(+), 60 deletions(-) diff --git a/analyzers/EmlParser/EmlParser.json b/analyzers/EmlParser/EmlParser.json index 0cda14779..8ac46027e 100644 --- a/analyzers/EmlParser/EmlParser.json +++ b/analyzers/EmlParser/EmlParser.json @@ -1,6 +1,6 @@ { "name": "EmlParser", - "version": "2.0", + "version": "2.1", "author": "StrangeBee", "url": "https://github.com/TheHive-Project/Cortex-Analyzers", "license": "AGPL-V3", @@ -24,6 +24,14 @@ "multi": false, "required": true }, + { + "name": "sanitized_rendering", + "description": "If wkhtmltoimage fails because some imports can't be loaded the rendering will be done without them. If disabled an error message will be displayed instead.", + "defaultValue": false, + "type": "boolean", + "multi": false, + "required": true + }, { "name": "wkhtmltoimage_path", "description": "Path of wkhtmltoimage program on the system. This program is required to generate visualisation of the message as it seen in mail client program. If using Docker image, use default configuration.", diff --git a/analyzers/EmlParser/parse.py b/analyzers/EmlParser/parse.py index 55b3add50..b792273cd 100755 --- a/analyzers/EmlParser/parse.py +++ b/analyzers/EmlParser/parse.py @@ -1,25 +1,28 @@ #!/usr/bin/env python3 # encoding: utf-8 +import base64 +import binascii import datetime import os +import re +from io import BytesIO + import eml_parser -from cortexutils.analyzer import Analyzer -import magic -import binascii -import base64 import imgkit +import magic from PIL import Image -from io import BytesIO from bs4 import BeautifulSoup +from cortexutils.analyzer import Analyzer + # TODO: Optional: add a flavor: with image (the other one gives all http links found in the message, can be run as a second analysis. Manage PAP/TLP, use at your own risk) - + class EmlParserAnalyzer(Analyzer): def __init__(self): Analyzer.__init__(self) - #filename of the observable + # filename of the observable self.filename = self.getParam('attachment.name', 'noname.ext') self.filepath = self.getParam('file', None, 'File is missing') @@ -30,12 +33,13 @@ def __init__(self): 'config.wkhtmltoimage_path', '/usr/bin/wkhtmltoimage'), 'width_size': self.get_param('config.width_size', 1024) } - + self.sanitized_rendering = self.get_param('config.sanitized_rendering', False) + def run(self): if self.data_type == 'file': try: parsingResult = parseEml( - self.filepath, self.job_directory, self.wkhtmltoimage) + self.filepath, self.job_directory, self.wkhtmltoimage, self.sanitized_rendering) self.report(parsingResult) except Exception as e: # self.unexpectedError(e) @@ -53,12 +57,12 @@ def summary(self, raw): predicate_urls = "Urls" value_attachments = "0" value_urls = "0" - + # Get values if 'attachments' in raw: value_attachments = len(raw['attachments']) - if 'url' in raw.get('iocs'): - value_urls = len(raw.get('iocs').get('url')) + # if 'url' in raw.get('iocs'): + # value_urls = len(raw.get('iocs').get('url')) # Build summary taxonomies.append(self.build_taxonomy( @@ -72,37 +76,34 @@ def artifacts(self, raw): urls = raw.get('iocs').get('url') ip = raw.get('iocs').get('ip') domains = raw.get('iocs').get('domain') - - ## Extract email addresses mail_addresses = raw.get('iocs').get('email') hashes = raw.get('iocs').get('hash') if urls: for u in urls: - artifacts.append(self.build_artifact('url',str(u))) + artifacts.append(self.build_artifact('url', str(u["data"]), tags=u["tag"] + ['autoImport:true'])) if ip: for i in ip: - artifacts.append(self.build_artifact('ip',str(i))) + artifacts.append(self.build_artifact('ip', str(i["data"]), tags=i["tag"])) if mail_addresses: for e in mail_addresses: - artifacts.append(self.build_artifact('mail',str(e))) - if domains: - for e in domains: - artifacts.append(self.build_artifact('domain',str(e))) + artifacts.append(self.build_artifact('mail', str(e["data"]), tags=e["tag"] + ['autoImport:true'])) + if domains: + for d in domains: + artifacts.append(self.build_artifact('domain', str(d["data"]), tags=d["tag"])) if hashes: - for h in hashes: - artifacts.append(self.build_artifact('hash',str(h.get('hash')))) - artifacts.append(self.build_artifact('filename',str(h['filename']))) + for h in hashes: + artifacts.append( + self.build_artifact('hash', str(h["hash"]), tags=["body:attachment", "autoImport:true"] + h["tag"])) + artifacts.append(self.build_artifact('filename', str(h['filename']), + tags=["body:attachment", "autoImport:true"] + h["tag"])) filepath = os.path.join(self.job_directory, 'output', h.get('filename')) - artifacts.append(self.build_artifact('file', filepath)) - - # if 'text_html' in raw.get('body'): - # urls.extend(raw.get('body').get('text_html').get('uri') + artifacts.append( + self.build_artifact('file', filepath, tags=["body:attachment", "autoImport:true"] + h["tag"])) return artifacts -def parseEml(filepath, job_directory, wkhtmltoimage): - +def parseEml(filepath, job_directory, wkhtmltoimage, sanitized_rendering): ep = eml_parser.EmlParser(include_raw_body=True, include_attachment_data=True) with open(filepath, 'rb') as f: raw_email = f.read() @@ -123,7 +124,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage): ## ## Extract raw email - ## + ## result['raw_email'] = raw_email.decode('utf-8') ## ## Extract SMTP envelope @@ -136,8 +137,8 @@ def parseEml(filepath, job_directory, wkhtmltoimage): 'header').get('header').get('x-delivered-to', '') ## - ## Extract Headers - ## + ## Extract Headers + ## headers['from'] = decoded_email.get('header').get('header').get('from', []) headers['to'] = decoded_email.get('header').get('header').get('to', []) headers['cc'] = decoded_email.get('header').get('header').get('cc', []) @@ -147,7 +148,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage): headers['date'] = decoded_email.get('header').get('header').get('date', '')[0] headers['received'] = decoded_email.get('header').get('received') # Make dates ready for json - for h in headers['received']: + for h in headers['received']: if isinstance(h.get('date'), datetime.datetime): d = h.get('date').isoformat() h['date'] = d @@ -155,7 +156,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage): ## ## Extract body text/plain and text/html - ## + ## body = dict() if 'body' in decoded_email: body['text_plain'] = list() @@ -163,19 +164,29 @@ def parseEml(filepath, job_directory, wkhtmltoimage): for b in decoded_email.get('body'): ## text/plain if b.get('content_type') == "text/plain": - body['text_plain'].append(b) + body['text_plain'].append(b) b['beautified_text'] = BeautifulSoup( - b.get('content'), 'html.parser').prettify() - iocs['url'].extend(ep.get_uri_ondata(b.get('content'))) - + b.get('content'), 'html.parser').prettify() + for url in ep.get_uri_ondata(b.get('content')): + iocs['url'].append({"data": url, "tag": ["body:text/plain"]}) + ## text/html elif b.get('content_type') == "text/html": - iocs['url'].extend(ep.get_uri_ondata(b.get('content'))) - - ## Generate rendering image if option is enabled - if wkhtmltoimage.get('enable'): + for url in ep.get_uri_ondata(b.get('content')): + iocs['url'].append({"data": url, "tag": ["body:text/html"]}) - img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp") + ## Generate rendering image if option is enabled + if wkhtmltoimage.get('enable'): + try: + img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp") + except Exception as e: + try: + b["content"] = remove_html_imports(b["content"], e) + img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp") + except Exception as e: + b[ + "content"] = '
WARNING: this page cannot be rendered because some imports failed
' + img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp") b['rendered_html'] = "data:{};base64,{}".format( "image/png", base64_image(img_file.get('img_path'), @@ -184,13 +195,13 @@ def parseEml(filepath, job_directory, wkhtmltoimage): ) b['beautified_html'] = BeautifulSoup( b.get('content'), 'html.parser').prettify() - + body['text_html'].append(b) result['body'] = body ## ## Extract Attachments - ## + ## result['attachments'] = list() if 'attachment' in decoded_email.keys(): for a in decoded_email.get('attachment'): @@ -202,32 +213,38 @@ def parseEml(filepath, job_directory, wkhtmltoimage): f.close() a['raw'] = a.get('raw').decode('ascii') result['attachments'].append(a) - iocs['hash'].extend([{ + iocs['hash'].append({ 'hash': a.get('hash').get('sha256'), - 'filename': a.get('filename') - }]) - + 'filename': a.get('filename'), + 'tag': ["content-type:{}".format(a.get('content_header').get('content-type')[0].split(';')[0])] + }) + ## ## Extract IOCs - ## - iocs['ip'].extend(decoded_email.get('header').get('received_ip', [])) - iocs['domain'].extend(decoded_email.get('header').get('received_domain', [])) + ## + for ip in decoded_email.get('header').get('received_ip', []): + iocs['ip'].append({"data": ip, "tag": ["header:Received"]}) + for domain in decoded_email.get('header').get('received_domain', []): + iocs['domain'].append({"data": domain, "tag": ["header:Received"]}) ### Email - for field in ['cc', + for field in ['cc', 'bcc', 'delivered_to', 'received_foremail', ]: - iocs['email'].extend(decoded_email.get('header').get(field, [])) - iocs['email'].append(decoded_email.get('header').get('from', '')) + for email in decoded_email.get('header').get(field, []): + if field == "delivered_to": + iocs['email'].append({"data": email, "tag": ["header:To"]}) + else: + iocs['email'].append({"data": email, "tag": ["header:{}".format(field.capitalize())]}) + iocs['email'].append({'data': decoded_email.get('header').get('from', ''), "tag": ["header:From"]}) result['iocs'] = iocs return result -def convert_png(content: str, i, wkhtmltoimage_path:str, output_path: str): - +def convert_png(content: str, i, wkhtmltoimage_path: str, output_path: str): config = imgkit.config( wkhtmltoimage=wkhtmltoimage_path ) @@ -235,9 +252,9 @@ def convert_png(content: str, i, wkhtmltoimage_path:str, output_path: str): 'encoding': 'UTF-8', 'disable-javascript': '', 'load-media-error-handling': 'ignore', - 'load-error-handling':'ignore' + 'load-error-handling': 'ignore' } - imgkit.from_string(content, + imgkit.from_string(content, "{}/{}.png".format(output_path, i), options=options, config=config @@ -272,5 +289,22 @@ def base64_image(img_path, width): except Exception as e: return "No image" + +def remove_html_imports(html_str, txt): + """ + Remove all import statements from the html string. + """ + body_pattern = r']*>' + import_pattern = '\S+="https?:\/\/\S+"' + warning = '
WARNING: this page was modified for rendering because some imports failed
' + splitted_html = html_str.splitlines() + for index, line in enumerate(splitted_html): + if re.search(body_pattern, line): + splitted_html.insert(index + 1, warning) + sanitazed_line = re.sub(import_pattern, '', line) + splitted_html[index] = sanitazed_line + return "\r\n".join(splitted_html) + + if __name__ == '__main__': EmlParserAnalyzer().run() From 7937ad93f1b1da2c325ef2062f9d7932f0760b79 Mon Sep 17 00:00:00 2001 From: migueldo Date: Thu, 7 Jul 2022 18:37:18 +0200 Subject: [PATCH 2/2] removed sanitized rendering --- analyzers/EmlParser/EmlParser.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/analyzers/EmlParser/EmlParser.json b/analyzers/EmlParser/EmlParser.json index 8ac46027e..1a5511815 100644 --- a/analyzers/EmlParser/EmlParser.json +++ b/analyzers/EmlParser/EmlParser.json @@ -24,14 +24,6 @@ "multi": false, "required": true }, - { - "name": "sanitized_rendering", - "description": "If wkhtmltoimage fails because some imports can't be loaded the rendering will be done without them. If disabled an error message will be displayed instead.", - "defaultValue": false, - "type": "boolean", - "multi": false, - "required": true - }, { "name": "wkhtmltoimage_path", "description": "Path of wkhtmltoimage program on the system. This program is required to generate visualisation of the message as it seen in mail client program. If using Docker image, use default configuration.",