From e5cd3a80be69e925fcb2ebddd0e9ec16cdac7d57 Mon Sep 17 00:00:00 2001
From: migueldo <mail.deoliveiramiguel@gmail.com>
Date: Thu, 7 Jul 2022 18:36:36 +0200
Subject: [PATCH 1/2] emlParser with added functionalities for PR

---
 analyzers/EmlParser/EmlParser.json |  10 +-
 analyzers/EmlParser/parse.py       | 152 ++++++++++++++++++-----------
 2 files changed, 102 insertions(+), 60 deletions(-)

diff --git a/analyzers/EmlParser/EmlParser.json b/analyzers/EmlParser/EmlParser.json
index 0cda14779..8ac46027e 100644
--- a/analyzers/EmlParser/EmlParser.json
+++ b/analyzers/EmlParser/EmlParser.json
@@ -1,6 +1,6 @@
 {
     "name": "EmlParser",
-    "version": "2.0",
+    "version": "2.1",
     "author": "StrangeBee",
     "url": "https://github.com/TheHive-Project/Cortex-Analyzers",
     "license": "AGPL-V3",
@@ -24,6 +24,14 @@
             "multi": false,
             "required": true
         },
+        {
+            "name": "sanitized_rendering",
+            "description": "If wkhtmltoimage fails because some imports can't be loaded the rendering will be done without them. If disabled an error message will be displayed instead.",
+            "defaultValue": false,
+            "type": "boolean",
+            "multi": false,
+            "required": true
+        },
         {
             "name": "wkhtmltoimage_path",
             "description": "Path of wkhtmltoimage program on the system. This program is required to generate visualisation of the message as it seen in mail client program. If using Docker image, use default configuration.",
diff --git a/analyzers/EmlParser/parse.py b/analyzers/EmlParser/parse.py
index 55b3add50..b792273cd 100755
--- a/analyzers/EmlParser/parse.py
+++ b/analyzers/EmlParser/parse.py
@@ -1,25 +1,28 @@
 #!/usr/bin/env python3
 # encoding: utf-8
+import base64
+import binascii
 import datetime
 import os
+import re
+from io import BytesIO
+
 import eml_parser
-from cortexutils.analyzer import Analyzer
-import magic
-import binascii
-import base64
 import imgkit
+import magic
 from PIL import Image
-from io import BytesIO
 from bs4 import BeautifulSoup
+from cortexutils.analyzer import Analyzer
+
 
 # TODO: Optional: add a flavor: with image (the other one gives all http links found in the message, can be run as a second analysis. Manage PAP/TLP, use at your own risk)
- 
+
 
 class EmlParserAnalyzer(Analyzer):
 
     def __init__(self):
         Analyzer.__init__(self)
-        #filename of the observable
+        # filename of the observable
         self.filename = self.getParam('attachment.name', 'noname.ext')
         self.filepath = self.getParam('file', None, 'File is missing')
 
@@ -30,12 +33,13 @@ def __init__(self):
                 'config.wkhtmltoimage_path', '/usr/bin/wkhtmltoimage'),
             'width_size': self.get_param('config.width_size', 1024)
         }
-    
+        self.sanitized_rendering = self.get_param('config.sanitized_rendering', False)
+
     def run(self):
         if self.data_type == 'file':
             try:
                 parsingResult = parseEml(
-                    self.filepath, self.job_directory, self.wkhtmltoimage)
+                    self.filepath, self.job_directory, self.wkhtmltoimage, self.sanitized_rendering)
                 self.report(parsingResult)
             except Exception as e:
                 # self.unexpectedError(e)
@@ -53,12 +57,12 @@ def summary(self, raw):
         predicate_urls = "Urls"
         value_attachments = "0"
         value_urls = "0"
-        
+
         # Get values
         if 'attachments' in raw:
             value_attachments = len(raw['attachments'])
-        if 'url' in raw.get('iocs'):
-            value_urls = len(raw.get('iocs').get('url'))
+        # if 'url' in raw.get('iocs'):
+        # value_urls = len(raw.get('iocs').get('url'))
 
         # Build summary
         taxonomies.append(self.build_taxonomy(
@@ -72,37 +76,34 @@ def artifacts(self, raw):
         urls = raw.get('iocs').get('url')
         ip = raw.get('iocs').get('ip')
         domains = raw.get('iocs').get('domain')
-        
-        ## Extract email addresses
         mail_addresses = raw.get('iocs').get('email')
         hashes = raw.get('iocs').get('hash')
 
         if urls:
             for u in urls:
-                artifacts.append(self.build_artifact('url',str(u)))
+                artifacts.append(self.build_artifact('url', str(u["data"]), tags=u["tag"] + ['autoImport:true']))
         if ip:
             for i in ip:
-                artifacts.append(self.build_artifact('ip',str(i)))
+                artifacts.append(self.build_artifact('ip', str(i["data"]), tags=i["tag"]))
         if mail_addresses:
             for e in mail_addresses:
-                artifacts.append(self.build_artifact('mail',str(e)))
-        if domains: 
-            for e in domains:
-                artifacts.append(self.build_artifact('domain',str(e)))
+                artifacts.append(self.build_artifact('mail', str(e["data"]), tags=e["tag"] + ['autoImport:true']))
+        if domains:
+            for d in domains:
+                artifacts.append(self.build_artifact('domain', str(d["data"]), tags=d["tag"]))
         if hashes:
-             for h in hashes:
-                artifacts.append(self.build_artifact('hash',str(h.get('hash'))))
-                artifacts.append(self.build_artifact('filename',str(h['filename']))) 
+            for h in hashes:
+                artifacts.append(
+                    self.build_artifact('hash', str(h["hash"]), tags=["body:attachment", "autoImport:true"] + h["tag"]))
+                artifacts.append(self.build_artifact('filename', str(h['filename']),
+                                                     tags=["body:attachment", "autoImport:true"] + h["tag"]))
                 filepath = os.path.join(self.job_directory, 'output', h.get('filename'))
-                artifacts.append(self.build_artifact('file', filepath))
-        
-        # if 'text_html' in raw.get('body'):
-        #     urls.extend(raw.get('body').get('text_html').get('uri')  
+                artifacts.append(
+                    self.build_artifact('file', filepath, tags=["body:attachment", "autoImport:true"] + h["tag"]))
         return artifacts
 
 
-def parseEml(filepath, job_directory, wkhtmltoimage):
-
+def parseEml(filepath, job_directory, wkhtmltoimage, sanitized_rendering):
     ep = eml_parser.EmlParser(include_raw_body=True, include_attachment_data=True)
     with open(filepath, 'rb') as f:
         raw_email = f.read()
@@ -123,7 +124,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
 
     ##
     ## Extract raw email
-    ## 
+    ##
     result['raw_email'] = raw_email.decode('utf-8')
     ##
     ## Extract SMTP envelope
@@ -136,8 +137,8 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
         'header').get('header').get('x-delivered-to', '')
 
     ##
-    ## Extract Headers 
-    ## 
+    ## Extract Headers
+    ##
     headers['from'] = decoded_email.get('header').get('header').get('from', [])
     headers['to'] = decoded_email.get('header').get('header').get('to', [])
     headers['cc'] = decoded_email.get('header').get('header').get('cc', [])
@@ -147,7 +148,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
     headers['date'] = decoded_email.get('header').get('header').get('date', '')[0]
     headers['received'] = decoded_email.get('header').get('received')
     # Make dates ready for json
-    for h in headers['received']: 
+    for h in headers['received']:
         if isinstance(h.get('date'), datetime.datetime):
             d = h.get('date').isoformat()
             h['date'] = d
@@ -155,7 +156,7 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
 
     ##
     ## Extract body text/plain and text/html
-    ## 
+    ##
     body = dict()
     if 'body' in decoded_email:
         body['text_plain'] = list()
@@ -163,19 +164,29 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
         for b in decoded_email.get('body'):
             ## text/plain
             if b.get('content_type') == "text/plain":
-                body['text_plain'].append(b)                
+                body['text_plain'].append(b)
                 b['beautified_text'] = BeautifulSoup(
-                        b.get('content'), 'html.parser').prettify()
-                iocs['url'].extend(ep.get_uri_ondata(b.get('content')))
-            
+                    b.get('content'), 'html.parser').prettify()
+                for url in ep.get_uri_ondata(b.get('content')):
+                    iocs['url'].append({"data": url, "tag": ["body:text/plain"]})
+
             ## text/html
             elif b.get('content_type') == "text/html":
-                iocs['url'].extend(ep.get_uri_ondata(b.get('content')))
-               
-               ## Generate rendering image if option is enabled
-                if wkhtmltoimage.get('enable'):
+                for url in ep.get_uri_ondata(b.get('content')):
+                    iocs['url'].append({"data": url, "tag": ["body:text/html"]})
 
-                    img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
+                ## Generate rendering image if option is enabled
+                if wkhtmltoimage.get('enable'):
+                    try:
+                        img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
+                    except Exception as e:
+                        try:
+                            b["content"] = remove_html_imports(b["content"], e)
+                            img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
+                        except Exception as e:
+                            b[
+                                "content"] = '<html><body><div style="background-color:red; color:white; text-align: center;"><strong>WARNING:</strong> this page cannot be rendered because some imports failed</div></body></html>'
+                            img_file = convert_png(b.get('content'), 0, wkhtmltoimage.get('path'), "/tmp")
                     b['rendered_html'] = "data:{};base64,{}".format(
                         "image/png",
                         base64_image(img_file.get('img_path'),
@@ -184,13 +195,13 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
                     )
                     b['beautified_html'] = BeautifulSoup(
                         b.get('content'), 'html.parser').prettify()
-                
+
                 body['text_html'].append(b)
     result['body'] = body
 
     ##
     ## Extract Attachments
-    ## 
+    ##
     result['attachments'] = list()
     if 'attachment' in decoded_email.keys():
         for a in decoded_email.get('attachment'):
@@ -202,32 +213,38 @@ def parseEml(filepath, job_directory, wkhtmltoimage):
                 f.close()
                 a['raw'] = a.get('raw').decode('ascii')
             result['attachments'].append(a)
-            iocs['hash'].extend([{
+            iocs['hash'].append({
                 'hash': a.get('hash').get('sha256'),
-                'filename': a.get('filename')
-            }])
-    
+                'filename': a.get('filename'),
+                'tag': ["content-type:{}".format(a.get('content_header').get('content-type')[0].split(';')[0])]
+            })
+
     ##
     ## Extract IOCs
-    ## 
-    iocs['ip'].extend(decoded_email.get('header').get('received_ip', []))
-    iocs['domain'].extend(decoded_email.get('header').get('received_domain', []))
+    ##
+    for ip in decoded_email.get('header').get('received_ip', []):
+        iocs['ip'].append({"data": ip, "tag": ["header:Received"]})
+    for domain in decoded_email.get('header').get('received_domain', []):
+        iocs['domain'].append({"data": domain, "tag": ["header:Received"]})
     ### Email
-    for field in ['cc', 
+    for field in ['cc',
                   'bcc',
                   'delivered_to',
                   'received_foremail',
                   ]:
-        iocs['email'].extend(decoded_email.get('header').get(field, []))
-    iocs['email'].append(decoded_email.get('header').get('from', ''))
+        for email in decoded_email.get('header').get(field, []):
+            if field == "delivered_to":
+                iocs['email'].append({"data": email, "tag": ["header:To"]})
+            else:
+                iocs['email'].append({"data": email, "tag": ["header:{}".format(field.capitalize())]})
+    iocs['email'].append({'data': decoded_email.get('header').get('from', ''), "tag": ["header:From"]})
 
     result['iocs'] = iocs
 
     return result
 
 
-def convert_png(content: str, i, wkhtmltoimage_path:str, output_path: str):
-
+def convert_png(content: str, i, wkhtmltoimage_path: str, output_path: str):
     config = imgkit.config(
         wkhtmltoimage=wkhtmltoimage_path
     )
@@ -235,9 +252,9 @@ def convert_png(content: str, i, wkhtmltoimage_path:str, output_path: str):
                'encoding': 'UTF-8',
                'disable-javascript': '',
                'load-media-error-handling': 'ignore',
-                'load-error-handling':'ignore'
+               'load-error-handling': 'ignore'
                }
-    imgkit.from_string(content, 
+    imgkit.from_string(content,
                        "{}/{}.png".format(output_path, i),
                        options=options,
                        config=config
@@ -272,5 +289,22 @@ def base64_image(img_path, width):
     except Exception as e:
         return "No image"
 
+
+def remove_html_imports(html_str, txt):
+    """Return html_str with every attr="http(s)://..." attribute removed.
+
+    A red warning <div> is added after each line holding a <body> tag; txt is unused.
+    """
+    body_pattern = re.compile(r'<body[^>]*>')
+    import_pattern = re.compile(r'\S+="https?://\S+"')  # raw string: '\S' is not a valid str escape
+    warning = '<div style="background-color:red; color:white; text-align: center;"><strong>WARNING:</strong> this page was modified for rendering because some imports failed</div>'
+    sanitized_html = []
+    for line in html_str.splitlines():
+        sanitized_html.append(import_pattern.sub('', line))
+        if body_pattern.search(line):  # tested on the unsanitized line
+            sanitized_html.append(warning)  # built in a new list: no mutation while iterating
+    return "\r\n".join(sanitized_html)
+
+
 if __name__ == '__main__':
     EmlParserAnalyzer().run()

From 7937ad93f1b1da2c325ef2062f9d7932f0760b79 Mon Sep 17 00:00:00 2001
From: migueldo <mail.deoliveiramiguel@gmail.com>
Date: Thu, 7 Jul 2022 18:37:18 +0200
Subject: [PATCH 2/2] removed sanitized rendering

---
 analyzers/EmlParser/EmlParser.json | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/analyzers/EmlParser/EmlParser.json b/analyzers/EmlParser/EmlParser.json
index 8ac46027e..1a5511815 100644
--- a/analyzers/EmlParser/EmlParser.json
+++ b/analyzers/EmlParser/EmlParser.json
@@ -24,14 +24,6 @@
             "multi": false,
             "required": true
         },
-        {
-            "name": "sanitized_rendering",
-            "description": "If wkhtmltoimage fails because some imports can't be loaded the rendering will be done without them. If disabled an error message will be displayed instead.",
-            "defaultValue": false,
-            "type": "boolean",
-            "multi": false,
-            "required": true
-        },
         {
             "name": "wkhtmltoimage_path",
             "description": "Path of wkhtmltoimage program on the system. This program is required to generate visualisation of the message as it seen in mail client program. If using Docker image, use default configuration.",