Skip to content

Commit

Permalink
Merge pull request #486 from CIRCL/crawler_v2
Browse files Browse the repository at this point in the history
Crawler v2 - Add cookiejar - use cookie to bypass login form
  • Loading branch information
Terrtia authored Apr 1, 2020
2 parents e3e543b + eea2e17 commit d72f28f
Show file tree
Hide file tree
Showing 26 changed files with 1,701 additions and 311 deletions.
11 changes: 6 additions & 5 deletions bin/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,23 +351,24 @@ def search_potential_source_domain(type_service, domain):
# get HAR files
default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har")
if default_crawler_har:
default_crawler_har = 1
default_crawler_har = True
else:
default_crawler_har = 0
default_crawler_har = False

# get PNG files
default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png")
if default_crawler_png:
default_crawler_png = 1
default_crawler_png = True
else:
default_crawler_png = 0
default_crawler_png = False

# Default crawler options
default_crawler_config = {'html': 1,
default_crawler_config = {'html': True,
'har': default_crawler_har,
'png': default_crawler_png,
'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"),
'cookiejar_uuid': None,
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}

# Track launched crawler
Expand Down
21 changes: 21 additions & 0 deletions bin/lib/Screenshot.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import base64
import os
import sys
import redis

from hashlib import sha256
from io import BytesIO

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
Expand Down Expand Up @@ -164,6 +166,25 @@ def get_screenshot_file_content(sha256_string):
file_content = BytesIO(f.read())
return file_content

# if force save, ignore max_size
def save_crawled_screeshot(b64_screenshot, max_size, f_save=False):
    """Decode a base64-encoded screenshot and store it under its SHA-256 digest.

    The decoded size is estimated from the length of the base64 text
    (3 bytes for every 4 characters), so a screenshot that is too large is
    rejected before any decoding work is done.

    Args:
        b64_screenshot (str): standard base64 text of the image.
        max_size (int): size limit in bytes; the screenshot is saved only
            when its estimated size is strictly below this value.
        f_save (bool): force the save, ignoring ``max_size``.

    Returns:
        str | bool: hex SHA-256 digest of the decoded image when it was
        written or was already on disk; ``False`` when the screenshot was
        rejected because of its size.
    """
    # Estimate only: padding characters are counted too, so the real decoded
    # size can be up to two bytes smaller.
    screenshot_size = (len(b64_screenshot) * 3) / 4
    if screenshot_size >= max_size and not f_save:
        return False

    image_content = base64.standard_b64decode(b64_screenshot.encode())
    sha256_string = sha256(image_content).hexdigest()
    filepath = get_screenshot_filepath(sha256_string)
    if os.path.isfile(filepath):
        # The file is named after the digest of its content, so an existing
        # file is treated as this same screenshot and is not rewritten.
        return sha256_string

    # exist_ok=True: a separate "exists?" check followed by makedirs() can
    # raise FileExistsError when another process creates the directory in
    # between the two calls.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'wb') as f:
        f.write(image_content)
    return sha256_string

def save_screenshot_file(sha256_string, io_content):
filepath = get_screenshot_filepath(sha256_string)
if os.path.isfile(filepath):
Expand Down
Loading

0 comments on commit d72f28f

Please sign in to comment.