Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing bugs in regular expressions and code logic (DEV) #269

Merged
merged 3 commits into from
Jun 10, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 31 additions & 31 deletions assemblyline/common/identify.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@

STRONG_INDICATORS = {
'code/vbs': [
re.compile(rb'(^|\n)On Error Resume Next'),
re.compile(rb'(^|\n)On[ \t]+Error[ \t]+Resume[ \t]+Next'),
re.compile(rb'(^|\n)(?:Private)?[ \t]*Sub[ \t]+\w+\(*'),
re.compile(rb'(^|\n)End Module'),
re.compile(rb'(^|\n)End[ \t]+Module'),
re.compile(rb'(^|\n)ExecuteGlobal'),
re.compile(rb'(^|\n)REM '),
re.compile(rb'(^|\n)REM[ \t]+'),
re.compile(rb'(ubound|lbound)\('),
],
'code/javascript': [
Expand All @@ -43,39 +43,39 @@
'code/csharp': [
re.compile(rb'(^|\n)[ \t]*namespace[ \t]+[\w.]+'),
re.compile(rb'(^|\n)[ \t]*using[ \t]+[\w.]+;'),
re.compile(rb'(^|\n)[ \t]*internal class '),
re.compile(rb'(^|\n)[ \t]*internal[ \t]+class[ \t]+'),
],
'code/php': [
re.compile(rb'(^|\n)<\?php'),
re.compile(rb'namespace[ \t]+[\w.]+'),
re.compile(rb'function[ \t]*\w+[ \t]*\(\$[^)]+\)[ \t]*{'),
re.compile(rb'function[ \t]+\w+[ \t]*\([ \t]*\$[^)]+\)[ \t]*{'),
re.compile(rb'\beval[ \t]*\('),
],
'code/c': [
re.compile(rb'(^|\n)(static|typedef)?[ \t]*struct '),
re.compile(rb'(^|\n)(static|typedef)?[ \t]+struct[ \t]+'),
re.compile(rb'(^|\n)#include[ \t]*([<"])[\w./]+([>"])'),
re.compile(rb'(^|\n)#(ifndef |define |endif |pragma )'),
re.compile(rb'(^|\n)#(ifndef|define|endif|pragma)[ \t]+'),
],
'code/python': [
re.compile(rb'(^|\n)[ \t]*if __name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:'),
re.compile(rb'(^|\n)[ \t]*if[ \t]+__name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:'),
re.compile(rb'(^|\n)[ \t]*from[ \t]+[\w.]+[ \t]+import[ \t]+[\w.*]+([ \t]+as \w+)?'),
re.compile(rb'(^|\n)[ \t]*def[ \t]*\w+[ \t]*\([^)]*\)[ \t]*:'),
],
'code/rust': [
re.compile(rb'(^|\n)(pub|priv)[ \t]*(struct |enum |impl |const )'),
re.compile(rb'(^|\n)[ \t]*fn[ \t]*\w+[ \t]*\(&self'),
re.compile(rb'(^|\n)(pub|priv)[ \t]+(struct|enum|impl|const)[ \t]+'),
re.compile(rb'(^|\n)[ \t]*fn[ \t]+\w+[ \t]*\(&self'),
re.compile(rb'(println!|panic!)'),
],
'code/lisp': [
re.compile(rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar) '),
re.compile(rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar)[ \t]+'),
],
'code/java': [
re.compile(rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]+(extends[ \t]+\w+[ \t]+)?{'),
re.compile(rb'(^|\n)[\w \t]+\([^)]+\)[ \t]+throws[ \t]+[\w, \t]+[ \t]+{'),
re.compile(rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]*([ \t]+extends[ \t]+\w+[ \t]*)?{'),
re.compile(rb'(^|\n)[\w \t]+\([^)]*\)[ \t]+throws[ \t]+\w+[ \t]*(,[ \t]*\w+[ \t]*)*{'),
],
'code/perl': [
re.compile(rb'(^|\n)[ \t]*my[ \t]*\$\w+[ \t]*='),
re.compile(rb'(^|\n)[ \t]*sub[ \t]*\w+[ \t]*{'),
re.compile(rb'(^|\n)[ \t]*my[ \t]+\$\w+[ \t]*='),
re.compile(rb'(^|\n)[ \t]*sub[ \t]+\w+[ \t]*{'),
],
'code/ruby': [
re.compile(rb'(^|\n)[ \t]*require(_all)?[ \t]*\'[\w/]+\''),
Expand All @@ -101,16 +101,16 @@
re.compile(rb'^From: ', re.MULTILINE),
],
'metadata/sysmon': [
re.compile(rb'<Events>*'),
re.compile(rb'<Event>*'),
re.compile(rb'<Events>[^>]+'),
re.compile(rb'<Event>[^>]+'),
re.compile(rb'<\/Event>'),
re.compile(rb'<\/Events>'),
],
'code/xml': [
# Check if it has an xml declaration header
re.compile(rb'^\s*<\?xml[^>]+\?>', re.DOTALL | re.MULTILINE),
# Check if it begins and ends with <tag ... and </tag ...> (for informal xml usages)
re.compile(rb'^\s*<(?P<open>[\w:]+) .+</(?P=open)[^>]+>\s*$', re.DOTALL),
re.compile(rb'^\s*<(?P<open>[\w:]+).+</(?P=open)>\s*$', re.DOTALL),
# Check if a tag has an xmlns attribute
re.compile(rb'<[^>]+xmlns[:=][^>]+>', re.MULTILINE),
],
Expand All @@ -123,7 +123,7 @@
# Match one of the common Classes
re.compile(rb'(-memberDefinition|-Name|-namespace|-passthru)'),
# Match one of the common Methods
re.compile(rb'(\.Get(String|Field|Type|Method)\()')
re.compile(rb'\.Get(String|Field|Type|Method)\(')
]
}
STRONG_SCORE = 15
Expand All @@ -140,18 +140,18 @@
'code/jscript': [rb'new[ \t]+ActiveXObject\(', rb'Scripting\.Dictionary'],
'code/pdfjs': [rb'xfa\.((resolve|create)Node|datasets|form)', rb'\.oneOfChild'],
'code/vbs': [
rb'(^|\n)*[ ]{0,1000}[\t]*(Dim |Sub |Loop |Attribute |End Sub|Function |End Function )',
rb'(^|\n)*[ \t]{0,1000}((Dim|Sub|Loop|Attribute|Function|End[ \t]+Function)[ \t]+)|(End[ \t]+Sub)',
b'CreateObject',
b'WScript',
b'window_onload',
b'.SpawnInstance_',
b'.Security_',
b'WSH',
],
'code/csharp': [rb'(^|\n)(protected)?[ \t]*override'],
'code/sql': [rb'(^|\n)(create |drop |select |returns |declare )'],
'code/csharp': [rb'(^|\n)(protected[ \t]+)?[ \t]*override'],
'code/sql': [rb'(^|\n)(create|drop|select|returns|declare)[ \t]+'],
'code/php': [rb'\$this\->'],
'code/c': [rb'(^|\n)(const char \w+;|extern |uint(8|16|32)_t )'],
'code/c': [rb'(^|\n)(const[ \t]+char[ \t]+\w+;|extern[ \t]+|uint(8|16|32)_t[ \t]+)'],
'code/python': [b'try:', b'except:', b'else:'],
'code/java': [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;'],
'code/perl': [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;', b'@_'],
Expand Down Expand Up @@ -420,7 +420,7 @@


# Translate the match object into a sub-type label.
def subtype(label: str) -> str:
def _subtype(label: str) -> str:
for entry in sl_patterns:
if entry[1].search(label): # pylint: disable=E1101
return entry[0]
Expand Down Expand Up @@ -509,7 +509,7 @@ def ident(buf, length: int, path) -> Dict:
# ... keep highest precedence (lowest index) match.
if index < minimum:
minimum = index
sl_tag = subtype(label)
sl_tag = _subtype(label)

# If a label does match, take the best from that label
# Further labels from magic are probably terrible
Expand Down Expand Up @@ -567,7 +567,7 @@ def _differentiate(lang: str, scores_map: Dict) -> str:


# Pass a filepath and this will return the guessed language in the AL tag format.
def guess_language(path: str) -> Tuple[str, Union[str, int]]:
def _guess_language(path: str) -> Tuple[str, Union[str, int]]:
file_length = os.path.getsize(path)
with open(path, 'rb') as fh:
if file_length > 131070:
Expand Down Expand Up @@ -709,12 +709,12 @@ def dos_ident(path: str) -> str:
try:
with open(path, "rb") as fh:
file_header = fh.read(0x40)
if file_header[0:2] != "MZ":
if file_header[0:2] != b"MZ":
raise ValueError()

header_pos, = struct.unpack("<I", file_header[-4:])
fh.seek(header_pos)
if fh.read(4) != "PE\x00\x00":
if fh.read(4) != b"PE\x00\x00":
raise ValueError()
machine_id, = struct.unpack("<H", fh.read(2))
if machine_id == 0x014c:
Expand Down Expand Up @@ -749,7 +749,7 @@ def fileinfo(path: str) -> Dict:
with open(path, 'rb') as fh:
buf = fh.read()
buflen = len(buf)
data.update(ident(buf, buflen))
data.update(ident(buf, buflen, path))
data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''

# When data is parsed from a cart file we trust its metadata and can skip the recognition test later
Expand All @@ -767,7 +767,7 @@ def fileinfo(path: str) -> Dict:
# but don't commit to it being a zip if it can't be extracted
data['type'] = zip_ident(path, data['type'])
elif data['type'] == 'unknown':
data['type'], _ = guess_language(path)
data['type'], _ = _guess_language(path)
elif data['type'] == 'archive/cart':
data['type'] = cart_ident(path)
cart_metadata_set = True
Expand All @@ -777,7 +777,7 @@ def fileinfo(path: str) -> Dict:
elif data['type'] == 'code/html':
# Magic detects .hta files as .html, _guess_language detects .hta files as .js/.vbs
# If both conditions are met, it's fair to say that the file is an .hta
lang, _ = guess_language(path)
lang, _ = _guess_language(path)
if lang in ["code/javascript", "code/vbs"]:
data['type'] = 'code/hta'

Expand Down
Loading