Skip to content

Commit

Permalink
Merge pull request #1109 from myggl/develop
Browse files Browse the repository at this point in the history
Improved emlParser
  • Loading branch information
jeromeleonard authored Jul 8, 2022
2 parents eb248f6 + 7937ad9 commit eb37a3f
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 60 deletions.
2 changes: 1 addition & 1 deletion analyzers/EmlParser/EmlParser.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "EmlParser",
"version": "2.0",
"version": "2.1",
"author": "StrangeBee",
"url": "https://github.com/TheHive-Project/Cortex-Analyzers",
"license": "AGPL-V3",
Expand Down
152 changes: 93 additions & 59 deletions analyzers/EmlParser/parse.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@
#!/usr/bin/env python3
# encoding: utf-8
import base64
import binascii
import datetime
import os
import re
from io import BytesIO

import eml_parser
from cortexutils.analyzer import Analyzer
import magic
import binascii
import base64
import imgkit
import magic
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
from cortexutils.analyzer import Analyzer


# TODO: Optional: add a flavor: with image (the other one gives all http links found in the message, can be run as a second analysis. Manage PAP/TLP, use at your own risk)


class EmlParserAnalyzer(Analyzer):

def __init__(self):
Analyzer.__init__(self)
#filename of the observable
# filename of the observable
self.filename = self.getParam('attachment.name', 'noname.ext')
self.filepath = self.getParam('file', None, 'File is missing')

Expand All @@ -30,12 +33,13 @@ def __init__(self):
'config.wkhtmltoimage_path', '/usr/bin/wkhtmltoimage'),
'width_size': self.get_param('config.width_size', 1024)
}

self.sanitized_rendering = self.get_param('config.sanitized_rendering', False)

def run(self):
if self.data_type == 'file':
try:
parsingResult = parseEml(
self.filepath, self.job_directory, self.wkhtmltoimage)
self.filepath, self.job_directory, self.wkhtmltoimage, self.sanitized_rendering)
self.report(parsingResult)
except Exception as e:
# self.unexpectedError(e)
Expand All @@ -53,12 +57,12 @@ def summary(self, raw):
predicate_urls = "Urls"
value_attachments = "0"
value_urls = "0"

# Get values
if 'attachments' in raw:
value_attachments = len(raw['attachments'])
if 'url' in raw.get('iocs'):
value_urls = len(raw.get('iocs').get('url'))
# if 'url' in raw.get('iocs'):
# value_urls = len(raw.get('iocs').get('url'))

# Build summary
taxonomies.append(self.build_taxonomy(
Expand All @@ -72,37 +76,34 @@ def artifacts(self, raw):
urls = raw.get('iocs').get('url')
ip = raw.get('iocs').get('ip')
domains = raw.get('iocs').get('domain')

## Extract email addresses
mail_addresses = raw.get('iocs').get('email')
hashes = raw.get('iocs').get('hash')

if urls:
for u in urls:
artifacts.append(self.build_artifact('url',str(u)))
artifacts.append(self.build_artifact('url', str(u["data"]), tags=u["tag"] + ['autoImport:true']))
if ip:
for i in ip:
artifacts.append(self.build_artifact('ip',str(i)))
artifacts.append(self.build_artifact('ip', str(i["data"]), tags=i["tag"]))
if mail_addresses:
for e in mail_addresses:
artifacts.append(self.build_artifact('mail',str(e)))
if domains:
for e in domains:
artifacts.append(self.build_artifact('domain',str(e)))
artifacts.append(self.build_artifact('mail', str(e["data"]), tags=e["tag"] + ['autoImport:true']))
if domains:
for d in domains:
artifacts.append(self.build_artifact('domain', str(d["data"]), tags=d["tag"]))
if hashes:
for h in hashes:
artifacts.append(self.build_artifact('hash',str(h.get('hash'))))
artifacts.append(self.build_artifact('filename',str(h['filename'])))
for h in hashes:
artifacts.append(
self.build_artifact('hash', str(h["hash"]), tags=["body:attachment", "autoImport:true"] + h["tag"]))
artifacts.append(self.build_artifact('filename', str(h['filename']),
tags=["body:attachment", "autoImport:true"] + h["tag"]))
filepath = os.path.join(self.job_directory, 'output', h.get('filename'))
artifacts.append(self.build_artifact('file', filepath))

# if 'text_html' in raw.get('body'):
# urls.extend(raw.get('body').get('text_html').get('uri')
artifacts.append(
self.build_artifact('file', filepath, tags=["body:attachment", "autoImport:true"] + h["tag"]))
return artifacts


def parseEml(filepath, job_directory, wkhtmltoimage):

def parseEml(filepath, job_directory, wkhtmltoimage, sanitized_rendering):
ep = eml_parser.EmlParser(include_raw_body=True, include_attachment_data=True)
with open(filepath, 'rb') as f:
raw_email = f.read()
Expand All @@ -123,7 +124,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage):

##
## Extract raw email
##
##
result['raw_email'] = raw_email.decode('utf-8')
##
## Extract SMTP envelope
Expand All @@ -136,8 +137,8 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
'header').get('header').get('x-delivered-to', '')

##
## Extract Headers
##
## Extract Headers
##
headers['from'] = decoded_email.get('header').get('header').get('from', [])
headers['to'] = decoded_email.get('header').get('header').get('to', [])
headers['cc'] = decoded_email.get('header').get('header').get('cc', [])
Expand All @@ -147,35 +148,45 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
headers['date'] = decoded_email.get('header').get('header').get('date', '')[0]
headers['received'] = decoded_email.get('header').get('received')
# Make dates ready for json
for h in headers['received']:
for h in headers['received']:
if isinstance(h.get('date'), datetime.datetime):
d = h.get('date').isoformat()
h['date'] = d
result['headers'] = headers

##
## Extract body text/plain and text/html
##
##
body = dict()
if 'body' in decoded_email:
body['text_plain'] = list()
body['text_html'] = list()
for b in decoded_email.get('body'):
## text/plain
if b.get('content_type') == "text/plain":
body['text_plain'].append(b)
body['text_plain'].append(b)
b['beautified_text'] = BeautifulSoup(
b.get('content'), 'html.parser').prettify()
iocs['url'].extend(ep.get_uri_ondata(b.get('content')))

b.get('content'), 'html.parser').prettify()
for url in ep.get_uri_ondata(b.get('content')):
iocs['url'].append({"data": url, "tag": ["body:text/plain"]})

## text/html
elif b.get('content_type') == "text/html":
iocs['url'].extend(ep.get_uri_ondata(b.get('content')))

## Generate rendering image if option is enabled
if wkhtmltoimage.get('enable'):
for url in ep.get_uri_ondata(b.get('content')):
iocs['url'].append({"data": url, "tag": ["body:text/html"]})

img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
## Generate rendering image if option is enabled
if wkhtmltoimage.get('enable'):
try:
img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
except Exception as e:
try:
b["content"] = remove_html_imports(b["content"], e)
img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
except Exception as e:
b[
"content"] = '<html><body><div style="background-color:red; color:white; text-align: center;"><strong>WARNING:</strong> this page cannot be rendered because some imports failed</div></body></html>'
img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
b['rendered_html'] = "data:{};base64,{}".format(
"image/png",
base64_image(img_file.get('img_path'),
Expand All @@ -184,13 +195,13 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
)
b['beautified_html'] = BeautifulSoup(
b.get('content'), 'html.parser').prettify()

body['text_html'].append(b)
result['body'] = body

##
## Extract Attachments
##
##
result['attachments'] = list()
if 'attachment' in decoded_email.keys():
for a in decoded_email.get('attachment'):
Expand All @@ -202,42 +213,48 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
f.close()
a['raw'] = a.get('raw').decode('ascii')
result['attachments'].append(a)
iocs['hash'].extend([{
iocs['hash'].append({
'hash': a.get('hash').get('sha256'),
'filename': a.get('filename')
}])

'filename': a.get('filename'),
'tag': ["content-type:{}".format(a.get('content_header').get('content-type')[0].split(';')[0])]
})

##
## Extract IOCs
##
iocs['ip'].extend(decoded_email.get('header').get('received_ip', []))
iocs['domain'].extend(decoded_email.get('header').get('received_domain', []))
##
for ip in decoded_email.get('header').get('received_ip', []):
iocs['ip'].append({"data": ip, "tag": ["header:Received"]})
for domain in decoded_email.get('header').get('received_domain', []):
iocs['domain'].append({"data": domain, "tag": ["header:Received"]})
### Email
for field in ['cc',
for field in ['cc',
'bcc',
'delivered_to',
'received_foremail',
]:
iocs['email'].extend(decoded_email.get('header').get(field, []))
iocs['email'].append(decoded_email.get('header').get('from', ''))
for email in decoded_email.get('header').get(field, []):
if field == "delivered_to":
iocs['email'].append({"data": email, "tag": ["header:To"]})
else:
iocs['email'].append({"data": email, "tag": ["header:{}".format(field.capitalize())]})
iocs['email'].append({'data': decoded_email.get('header').get('from', ''), "tag": ["header:From"]})

result['iocs'] = iocs

return result


def convert_png(content: str, i, wkhtmltoimage_path:str, output_path: str):

def convert_png(content: str, i, wkhtmltoimage_path: str, output_path: str):
config = imgkit.config(
wkhtmltoimage=wkhtmltoimage_path
)
options = {'no-images': '',
'encoding': 'UTF-8',
'disable-javascript': '',
'load-media-error-handling': 'ignore',
'load-error-handling':'ignore'
'load-error-handling': 'ignore'
}
imgkit.from_string(content,
imgkit.from_string(content,
"{}/{}.png".format(output_path, i),
options=options,
config=config
Expand Down Expand Up @@ -272,5 +289,22 @@ def base64_image(img_path, width):
except Exception as e:
return "No image"


def remove_html_imports(html_str, txt):
    """Strip remote-resource references from an HTML document and flag it.

    Used as a fallback when wkhtmltoimage fails to render a message body:
    every attribute whose value is an http(s) URL (``src="http..."``,
    ``href="http..."``, ...) is removed — note this strips links as well,
    not only imports — and a red warning banner is inserted right after
    the ``<body>`` tag so the reader knows the page was modified.

    :param html_str: the HTML document as a single string
    :param txt: unused; kept for interface compatibility (callers pass the
                rendering exception here)
    :return: the sanitized document, lines re-joined with CRLF
    """
    # Compile once (raw strings: '\S' / '\/' in plain strings are invalid
    # escape sequences and raise SyntaxWarning on modern Python).
    body_re = re.compile(r'<body[^>]*>')
    remote_attr_re = re.compile(r'\S+="https?://\S+"')
    warning = ('<div style="background-color:red; color:white; text-align: center;">'
               '<strong>WARNING:</strong> this page was modified for rendering '
               'because some imports failed</div>')
    # Build a new list instead of insert()-ing into the list being iterated,
    # which silently shifted indices in the original implementation.
    sanitized_lines = []
    for line in html_str.splitlines():
        sanitized_lines.append(remote_attr_re.sub('', line))
        if body_re.search(line):
            sanitized_lines.append(warning)
    return "\r\n".join(sanitized_lines)


# Script entry point: Cortex runs this file directly; instantiate the
# analyzer and hand control to the cortexutils run loop.
if __name__ == '__main__':
    EmlParserAnalyzer().run()

0 comments on commit eb37a3f

Please sign in to comment.