Skip to content

Commit 0a97d60

Browse files
authored
Merge pull request #270 from CybercentreCanada/identify_bugs
Fixing bugs in regular expressions and code logic
2 parents 893edb5 + b114fe3 commit 0a97d60

File tree

2 files changed

+93
-99
lines changed

2 files changed

+93
-99
lines changed

assemblyline/common/identify.py

+31-31
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424

2525
STRONG_INDICATORS = {
2626
'code/vbs': [
27-
re.compile(rb'(^|\n)On Error Resume Next'),
27+
re.compile(rb'(^|\n)On[ \t]+Error[ \t]+Resume[ \t]+Next'),
2828
re.compile(rb'(^|\n)(?:Private)?[ \t]*Sub[ \t]+\w+\(*'),
29-
re.compile(rb'(^|\n)End Module'),
29+
re.compile(rb'(^|\n)End[ \t]+Module'),
3030
re.compile(rb'(^|\n)ExecuteGlobal'),
31-
re.compile(rb'(^|\n)REM '),
31+
re.compile(rb'(^|\n)REM[ \t]+'),
3232
re.compile(rb'(ubound|lbound)\('),
3333
],
3434
'code/javascript': [
@@ -43,39 +43,39 @@
4343
'code/csharp': [
4444
re.compile(rb'(^|\n)[ \t]*namespace[ \t]+[\w.]+'),
4545
re.compile(rb'(^|\n)[ \t]*using[ \t]+[\w.]+;'),
46-
re.compile(rb'(^|\n)[ \t]*internal class '),
46+
re.compile(rb'(^|\n)[ \t]*internal[ \t]+class[ \t]+'),
4747
],
4848
'code/php': [
4949
re.compile(rb'(^|\n)<\?php'),
5050
re.compile(rb'namespace[ \t]+[\w.]+'),
51-
re.compile(rb'function[ \t]*\w+[ \t]*\(\$[^)]+\)[ \t]*{'),
51+
re.compile(rb'function[ \t]+\w+[ \t]*\([ \t]*\$[^)]+\)[ \t\n]*{'),
5252
re.compile(rb'\beval[ \t]*\('),
5353
],
5454
'code/c': [
55-
re.compile(rb'(^|\n)(static|typedef)?[ \t]*struct '),
55+
re.compile(rb'(^|\n)(static|typedef)?[ \t]+struct[ \t]+'),
5656
re.compile(rb'(^|\n)#include[ \t]*([<"])[\w./]+([>"])'),
57-
re.compile(rb'(^|\n)#(ifndef |define |endif |pragma )'),
57+
re.compile(rb'(^|\n)#(ifndef|define|endif|pragma)[ \t]+'),
5858
],
5959
'code/python': [
60-
re.compile(rb'(^|\n)[ \t]*if __name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:'),
60+
re.compile(rb'(^|\n)[ \t]*if[ \t]+__name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:'),
6161
re.compile(rb'(^|\n)[ \t]*from[ \t]+[\w.]+[ \t]+import[ \t]+[\w.*]+([ \t]+as \w+)?'),
6262
re.compile(rb'(^|\n)[ \t]*def[ \t]*\w+[ \t]*\([^)]*\)[ \t]*:'),
6363
],
6464
'code/rust': [
65-
re.compile(rb'(^|\n)(pub|priv)[ \t]*(struct |enum |impl |const )'),
66-
re.compile(rb'(^|\n)[ \t]*fn[ \t]*\w+[ \t]*\(&self'),
65+
re.compile(rb'(^|\n)(pub|priv)[ \t]+(struct|enum|impl|const)[ \t]+'),
66+
re.compile(rb'(^|\n)[ \t]*fn[ \t]+\w+[ \t]*\(&self'),
6767
re.compile(rb'(println!|panic!)'),
6868
],
6969
'code/lisp': [
70-
re.compile(rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar) '),
70+
re.compile(rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar)[ \t]+'),
7171
],
7272
'code/java': [
73-
re.compile(rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]+(extends[ \t]+\w+[ \t]+)?{'),
74-
re.compile(rb'(^|\n)[\w \t]+\([^)]+\)[ \t]+throws[ \t]+[\w, \t]+[ \t]+{'),
73+
re.compile(rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]*([ \t]+extends[ \t]+\w+[ \t]*)?{'),
74+
re.compile(rb'(^|\n)[\w \t]+\([^)]*\)[ \t]+throws[ \t]+\w+[ \t]*(,[ \t]*\w+[ \t]*)*{'),
7575
],
7676
'code/perl': [
77-
re.compile(rb'(^|\n)[ \t]*my[ \t]*\$\w+[ \t]*='),
78-
re.compile(rb'(^|\n)[ \t]*sub[ \t]*\w+[ \t]*{'),
77+
re.compile(rb'(^|\n)[ \t]*my[ \t]+\$\w+[ \t]*='),
78+
re.compile(rb'(^|\n)[ \t]*sub[ \t]+\w+[ \t]*{'),
7979
],
8080
'code/ruby': [
8181
re.compile(rb'(^|\n)[ \t]*require(_all)?[ \t]*\'[\w/]+\''),
@@ -101,16 +101,16 @@
101101
re.compile(rb'^From: ', re.MULTILINE),
102102
],
103103
'metadata/sysmon': [
104-
re.compile(rb'<Events>*'),
105-
re.compile(rb'<Event>*'),
104+
re.compile(rb'<Events>[^>]+'),
105+
re.compile(rb'<Event>[^>]+'),
106106
re.compile(rb'<\/Event>'),
107107
re.compile(rb'<\/Events>'),
108108
],
109109
'code/xml': [
110110
# Check if it has an xml declaration header
111111
re.compile(rb'^\s*<\?xml[^>]+\?>', re.DOTALL | re.MULTILINE),
112112
# Check if it begins and ends with <tag ... and </tag ...> (for informal xml usages)
113-
re.compile(rb'^\s*<(?P<open>[\w:]+) .+</(?P=open)[^>]+>\s*$', re.DOTALL),
113+
re.compile(rb'^\s*<(?P<open>[\w:]+).+</(?P=open)>\s*$', re.DOTALL),
114114
# Check if a tag has an xmlns attribute
115115
re.compile(rb'<[^>]+xmlns[:=][^>]+>', re.MULTILINE),
116116
],
@@ -123,7 +123,7 @@
123123
# Match one of the common Classes
124124
re.compile(rb'(-memberDefinition|-Name|-namespace|-passthru)'),
125125
# Match one of the common Methods
126-
re.compile(rb'(\.Get(String|Field|Type|Method)\()')
126+
re.compile(rb'\.Get(String|Field|Type|Method)\(')
127127
]
128128
}
129129
STRONG_SCORE = 15
@@ -140,18 +140,18 @@
140140
'code/jscript': [rb'new[ \t]+ActiveXObject\(', rb'Scripting\.Dictionary'],
141141
'code/pdfjs': [rb'xfa\.((resolve|create)Node|datasets|form)', rb'\.oneOfChild'],
142142
'code/vbs': [
143-
rb'(^|\n)*[ ]{0,1000}[\t]*(Dim |Sub |Loop |Attribute |End Sub|Function |End Function )',
143+
rb'(^|\n)*[ \t]{0,1000}((Dim|Sub|Loop|Attribute|Function|End[ \t]+Function)[ \t]+)|(End[ \t]+Sub)',
144144
b'CreateObject',
145145
b'WScript',
146146
b'window_onload',
147147
b'.SpawnInstance_',
148148
b'.Security_',
149149
b'WSH',
150150
],
151-
'code/csharp': [rb'(^|\n)(protected)?[ \t]*override'],
152-
'code/sql': [rb'(^|\n)(create |drop |select |returns |declare )'],
151+
'code/csharp': [rb'(^|\n)(protected[ \t]+)?[ \t]*override'],
152+
'code/sql': [rb'(^|\n)(create|drop|select|returns|declare)[ \t]+'],
153153
'code/php': [rb'\$this\->'],
154-
'code/c': [rb'(^|\n)(const char \w+;|extern |uint(8|16|32)_t )'],
154+
'code/c': [rb'(^|\n)(const[ \t]+char[ \t]+\w+;|extern[ \t]+|uint(8|16|32)_t[ \t]+)'],
155155
'code/python': [b'try:', b'except:', b'else:'],
156156
'code/java': [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;'],
157157
'code/perl': [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;', b'@_'],
@@ -420,7 +420,7 @@
420420

421421

422422
# Translate the match object into a sub-type label.
423-
def subtype(label: str) -> str:
423+
def _subtype(label: str) -> str:
424424
for entry in sl_patterns:
425425
if entry[1].search(label): # pylint: disable=E1101
426426
return entry[0]
@@ -509,7 +509,7 @@ def ident(buf, length: int, path) -> Dict:
509509
# ... keep highest precedence (lowest index) match.
510510
if index < minimum:
511511
minimum = index
512-
sl_tag = subtype(label)
512+
sl_tag = _subtype(label)
513513

514514
# If a label does match, take the best from that label
515515
# Further labels from magic are probably terrible
@@ -567,7 +567,7 @@ def _differentiate(lang: str, scores_map: Dict) -> str:
567567

568568

569569
# Pass a filepath and this will return the guessed language in the AL tag format.
570-
def guess_language(path: str) -> Tuple[str, Union[str, int]]:
570+
def _guess_language(path: str) -> Tuple[str, Union[str, int]]:
571571
file_length = os.path.getsize(path)
572572
with open(path, 'rb') as fh:
573573
if file_length > 131070:
@@ -709,12 +709,12 @@ def dos_ident(path: str) -> str:
709709
try:
710710
with open(path, "rb") as fh:
711711
file_header = fh.read(0x40)
712-
if file_header[0:2] != "MZ":
712+
if file_header[0:2] != b"MZ":
713713
raise ValueError()
714714

715715
header_pos, = struct.unpack("<I", file_header[-4:])
716716
fh.seek(header_pos)
717-
if fh.read(4) != "PE\x00\x00":
717+
if fh.read(4) != b"PE\x00\x00":
718718
raise ValueError()
719719
machine_id, = struct.unpack("<H", fh.read(2))
720720
if machine_id == 0x014c:
@@ -749,7 +749,7 @@ def fileinfo(path: str) -> Dict:
749749
with open(path, 'rb') as fh:
750750
buf = fh.read()
751751
buflen = len(buf)
752-
data.update(ident(buf, buflen))
752+
data.update(ident(buf, buflen, path))
753753
data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''
754754

755755
# When data is parsed from a cart file we trust its metadata and can skip the recognition test later
@@ -767,7 +767,7 @@ def fileinfo(path: str) -> Dict:
767767
# but don't commit to it being a zip if it can't be extracted
768768
data['type'] = zip_ident(path, data['type'])
769769
elif data['type'] == 'unknown':
770-
data['type'], _ = guess_language(path)
770+
data['type'], _ = _guess_language(path)
771771
elif data['type'] == 'archive/cart':
772772
data['type'] = cart_ident(path)
773773
cart_metadata_set = True
@@ -777,7 +777,7 @@ def fileinfo(path: str) -> Dict:
777777
elif data['type'] == 'code/html':
778778
# Magic detects .hta files as .html, _guess_language detects .hta files as .js/.vbs
779779
# If both conditions are met, it's fair to say that the file is an .hta
780-
lang, _ = guess_language(path)
780+
lang, _ = _guess_language(path)
781781
if lang in ["code/javascript", "code/vbs"]:
782782
data['type'] = 'code/hta'
783783

0 commit comments

Comments
 (0)