Skip to content

Commit

Permalink
Add caching
Browse files Browse the repository at this point in the history
  • Loading branch information
ninoseki committed Apr 12, 2018
1 parent 79dc4d1 commit 7832cdb
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 15 deletions.
19 changes: 18 additions & 1 deletion analyzers/URLhaus/URLhaus.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,22 @@
"version": "0.1.0",
"description": "Search domains, URLs or hashes on URLhaus",
"dataTypeList": ["domain", "url", "hash"],
"command": "URLhaus/URLhaus_analyzer.py"
"command": "URLhaus/URLhaus_analyzer.py",
"configurationItems": [
{
"name": "cache.duration",
"description": "Define the cache duration",
"type": "number",
"multi": false,
"required": true,
"defaultValue": 3600
},
{
"name": "cache.root",
"description": "Define the path to the stored data",
"type": "string",
"multi": false,
"required": false
}
]
}
53 changes: 39 additions & 14 deletions analyzers/URLhaus/URLhaus.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,54 @@
from requests_html import HTMLSession
import urllib
from diskcache import Cache
from requests_html import HTML
import requests


class URLhaus:
def __init__(self, query):
"""Simple client to query URLhaus by abuse.ch.
:param query: domain, url or hash.
:param cache_duration: Duration before refreshing the cache (in seconds).
Ignored if `cache_duration` is 0.
:param cache_root: Path where to store the cached file.
:type query: string
:type cache_duration: int
:type cache_root: str
"""

def __init__(self,
             query,
             cache_duration=3600,
             cache_root="/tmp/cortex/URLhaus"):
    """Set up the client and, when enabled, its on-disk cache.

    :param query: domain, url or hash to search for.
    :param cache_duration: lifetime of a cached response in seconds.
                           ``0``, a negative value or ``None`` disables
                           caching.
    :param cache_root: directory handed to ``Cache`` for the cached data.
    """
    self.URL = "https://urlhaus.abuse.ch/browse.php"
    self.query = query
    # Always define the attribute so it can be read even when caching is off.
    self.cache_duration = cache_duration
    # ``None`` would make the ``> 0`` comparison raise on Python 3, so it is
    # treated the same as 0: no cache is opened.
    caching_enabled = bool(cache_duration) and cache_duration > 0
    self.cache = Cache(cache_root) if caching_enabled else None

def _get_raw_data(self):
    """Return the raw data for ``self.query``, going through the cache if any.

    With no cache (``self.cache`` is ``None``) the data is fetched on every
    call. Otherwise a cached value is returned when present; on a miss the
    data is fetched once, stored with ``self.cache_duration`` as its expiry
    and returned.

    :return: the value produced by ``self.fetch()`` (possibly a cached one).
    """
    if self.cache is None:
        return self.fetch()

    try:
        key = self.query.encode('utf-8')
    except AttributeError:
        # The query has no ``encode`` (it is not a str), so no cache key can
        # be built; fall back to an uncached fetch as before.
        return self.fetch()

    try:
        return self.cache[key]
    except KeyError:
        data = self.fetch()
        self.cache.set(key, data, expire=self.cache_duration)
        # Return the fetched value directly: reading it back from the cache
        # would raise KeyError if the entry was not stored or already expired.
        return data

def search(self):
    """Run the query and return the parsed results.

    The raw document is obtained through ``_get_raw_data()`` and handed to
    ``parse()``. ``fetch()`` is not called directly here: doing so would
    issue an extra request whose result is thrown away.

    :return: whatever ``self.parse()`` returns for the raw document.
    """
    raw = self._get_raw_data()
    return self.parse(raw)

def fetch(self):
    """Request ``self.URL`` with ``self.query`` as the ``search`` parameter.

    :return: the response body as text.
    :raises requests.exceptions.RequestException: on connection failure or
        when the timeout below is exceeded.
    """
    payload = {"search": self.query}
    # requests applies no timeout by default; without one a stalled server
    # would block the caller indefinitely.
    response = requests.get(self.URL, params=payload, timeout=30)
    return response.text

def parse(self, res):
def parse(self, doc):
results = []
table = res.html.find("table.table", first=True)
html = HTML(html=doc)
table = html.find("table.table", first=True)
rows = table.find("tr")[1:]
for row in rows:
cols = row.find("td")
Expand All @@ -31,9 +62,3 @@ def parse(self, res):
"reporter": cols[5].text
})
return results

def target_url(self):
    """Build the search URL: ``self.URL`` plus the url-encoded query string.

    :return: ``"<URL>?search=<encoded query>"``.
    """
    # ``import urllib`` alone does not load the ``urllib.parse`` submodule,
    # so import the function explicitly instead of relying on another
    # module having imported it already.
    from urllib.parse import urlencode

    return "{}?{}".format(self.URL, urlencode({"search": self.query}))
3 changes: 3 additions & 0 deletions analyzers/URLhaus/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
cortexutils
diskcache
requests
requests-html

0 comments on commit 7832cdb

Please sign in to comment.