24
24
25
25
STRONG_INDICATORS = {
26
26
'code/vbs' : [
27
- re .compile (rb'(^|\n)On Error Resume Next' ),
27
+ re .compile (rb'(^|\n)On[ \t]+ Error[ \t]+ Resume[ \t]+ Next' ),
28
28
re .compile (rb'(^|\n)(?:Private)?[ \t]*Sub[ \t]+\w+\(*' ),
29
- re .compile (rb'(^|\n)End Module' ),
29
+ re .compile (rb'(^|\n)End[ \t]+ Module' ),
30
30
re .compile (rb'(^|\n)ExecuteGlobal' ),
31
- re .compile (rb'(^|\n)REM ' ),
31
+ re .compile (rb'(^|\n)REM[ \t]+ ' ),
32
32
re .compile (rb'(ubound|lbound)\(' ),
33
33
],
34
34
'code/javascript' : [
43
43
'code/csharp' : [
44
44
re .compile (rb'(^|\n)[ \t]*namespace[ \t]+[\w.]+' ),
45
45
re .compile (rb'(^|\n)[ \t]*using[ \t]+[\w.]+;' ),
46
- re .compile (rb'(^|\n)[ \t]*internal class ' ),
46
+ re .compile (rb'(^|\n)[ \t]*internal[ \t]+ class[ \t]+ ' ),
47
47
],
48
48
'code/php' : [
49
49
re .compile (rb'(^|\n)<\?php' ),
50
50
re .compile (rb'namespace[ \t]+[\w.]+' ),
51
- re .compile (rb'function[ \t]* \w+[ \t]*\(\ $[^)]+\)[ \t]*{' ),
51
+ re .compile (rb'function[ \t]+ \w+[ \t]*\([ \t]*\ $[^)]+\)[ \t\n ]*{' ),
52
52
re .compile (rb'\beval[ \t]*\(' ),
53
53
],
54
54
'code/c' : [
55
- re .compile (rb'(^|\n)(static|typedef)?[ \t]* struct ' ),
55
+ re .compile (rb'(^|\n)(static|typedef)?[ \t]+ struct[ \t]+ ' ),
56
56
re .compile (rb'(^|\n)#include[ \t]*([<"])[\w./]+([>"])' ),
57
- re .compile (rb'(^|\n)#(ifndef |define |endif |pragma ) ' ),
57
+ re .compile (rb'(^|\n)#(ifndef|define|endif|pragma)[ \t]+ ' ),
58
58
],
59
59
'code/python' : [
60
- re .compile (rb'(^|\n)[ \t]*if __name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:' ),
60
+ re .compile (rb'(^|\n)[ \t]*if[ \t]+ __name__[ \t]*==[ \t]*[\'\"]__main__[\'\"][ \t]*:' ),
61
61
re .compile (rb'(^|\n)[ \t]*from[ \t]+[\w.]+[ \t]+import[ \t]+[\w.*]+([ \t]+as \w+)?' ),
62
62
re .compile (rb'(^|\n)[ \t]*def[ \t]*\w+[ \t]*\([^)]*\)[ \t]*:' ),
63
63
],
64
64
'code/rust' : [
65
- re .compile (rb'(^|\n)(pub|priv)[ \t]* (struct |enum |impl |const ) ' ),
66
- re .compile (rb'(^|\n)[ \t]*fn[ \t]* \w+[ \t]*\(&self' ),
65
+ re .compile (rb'(^|\n)(pub|priv)[ \t]+ (struct|enum|impl|const)[ \t]+ ' ),
66
+ re .compile (rb'(^|\n)[ \t]*fn[ \t]+ \w+[ \t]*\(&self' ),
67
67
re .compile (rb'(println!|panic!)' ),
68
68
],
69
69
'code/lisp' : [
70
- re .compile (rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar) ' ),
70
+ re .compile (rb'(^|\n)[ \t]*\((defmacro|defun|eval-when|in-package|list|export|defvar)[ \t]+ ' ),
71
71
],
72
72
'code/java' : [
73
- re .compile (rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]+( extends[ \t]+\w+[ \t]+ )?{' ),
74
- re .compile (rb'(^|\n)[\w \t]+\([^)]+ \)[ \t]+throws[ \t]+[\w, \t]+[ \t]+ {' ),
73
+ re .compile (rb'(^|\n)[ \t]*public[ \t]+class[ \t]+\w+[ \t]*([ \t]+ extends[ \t]+\w+[ \t]* )?{' ),
74
+ re .compile (rb'(^|\n)[\w \t]+\([^)]* \)[ \t]+throws[ \t]+\w+[ \t]*(,[ \t]*\w +[ \t]*)* {' ),
75
75
],
76
76
'code/perl' : [
77
- re .compile (rb'(^|\n)[ \t]*my[ \t]* \$\w+[ \t]*=' ),
78
- re .compile (rb'(^|\n)[ \t]*sub[ \t]* \w+[ \t]*{' ),
77
+ re .compile (rb'(^|\n)[ \t]*my[ \t]+ \$\w+[ \t]*=' ),
78
+ re .compile (rb'(^|\n)[ \t]*sub[ \t]+ \w+[ \t]*{' ),
79
79
],
80
80
'code/ruby' : [
81
81
re .compile (rb'(^|\n)[ \t]*require(_all)?[ \t]*\'[\w/]+\'' ),
101
101
re .compile (rb'^From: ' , re .MULTILINE ),
102
102
],
103
103
'metadata/sysmon' : [
104
- re .compile (rb'<Events>* ' ),
105
- re .compile (rb'<Event>* ' ),
104
+ re .compile (rb'<Events>[^>]+ ' ),
105
+ re .compile (rb'<Event>[^>]+ ' ),
106
106
re .compile (rb'<\/Event>' ),
107
107
re .compile (rb'<\/Events>' ),
108
108
],
109
109
'code/xml' : [
110
110
# Check if it has an xml declaration header
111
111
re .compile (rb'^\s*<\?xml[^>]+\?>' , re .DOTALL | re .MULTILINE ),
112
112
# Check if it begins and ends with <tag ... and </tag ...> (for informal xml usages)
113
- re .compile (rb'^\s*<(?P<open>[\w:]+) .+</(?P=open)[^>]+ >\s*$' , re .DOTALL ),
113
+ re .compile (rb'^\s*<(?P<open>[\w:]+).+</(?P=open)>\s*$' , re .DOTALL ),
114
114
# Check if a tag has an xmlns attribute
115
115
re .compile (rb'<[^>]+xmlns[:=][^>]+>' , re .MULTILINE ),
116
116
],
123
123
# Match one of the common Classes
124
124
re .compile (rb'(-memberDefinition|-Name|-namespace|-passthru)' ),
125
125
# Match one of the common Methods
126
- re .compile (rb'( \.Get(String|Field|Type|Method)\() ' )
126
+ re .compile (rb'\.Get(String|Field|Type|Method)\(' )
127
127
]
128
128
}
129
129
STRONG_SCORE = 15
140
140
'code/jscript' : [rb'new[ \t]+ActiveXObject\(' , rb'Scripting\.Dictionary' ],
141
141
'code/pdfjs' : [rb'xfa\.((resolve|create)Node|datasets|form)' , rb'\.oneOfChild' ],
142
142
'code/vbs' : [
143
- rb'(^|\n)*[ ]{0,1000}[\t]*( Dim |Sub |Loop |Attribute | End Sub| Function | End Function )' ,
143
+ rb'(^|\n)*[ \t ]{0,1000}(( Dim|Sub|Loop|Attribute|Function| End[ \t]+ Function)[ \t]+)|( End[ \t]+Sub )' ,
144
144
b'CreateObject' ,
145
145
b'WScript' ,
146
146
b'window_onload' ,
147
147
b'.SpawnInstance_' ,
148
148
b'.Security_' ,
149
149
b'WSH' ,
150
150
],
151
- 'code/csharp' : [rb'(^|\n)(protected)?[ \t]*override' ],
152
- 'code/sql' : [rb'(^|\n)(create |drop |select |returns |declare ) ' ],
151
+ 'code/csharp' : [rb'(^|\n)(protected[ \t]+ )?[ \t]*override' ],
152
+ 'code/sql' : [rb'(^|\n)(create|drop|select|returns|declare)[ \t]+ ' ],
153
153
'code/php' : [rb'\$this\->' ],
154
- 'code/c' : [rb'(^|\n)(const char \ w+;|extern |uint(8|16|32)_t )' ],
154
+ 'code/c' : [rb'(^|\n)(const[ \t]+ char[ \t]+\ w+;|extern[ \t]+ |uint(8|16|32)_t[ \t]+ )' ],
155
155
'code/python' : [b'try:' , b'except:' , b'else:' ],
156
156
'code/java' : [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;' ],
157
157
'code/perl' : [rb'(^|\n)[ \t]*package[ \t]+[\w\.]+;' , b'@_' ],
420
420
421
421
422
422
# Translate the match object into a sub-type label.
423
- def subtype (label : str ) -> str :
423
+ def _subtype (label : str ) -> str :
424
424
for entry in sl_patterns :
425
425
if entry [1 ].search (label ): # pylint: disable=E1101
426
426
return entry [0 ]
@@ -509,7 +509,7 @@ def ident(buf, length: int, path) -> Dict:
509
509
# ... keep highest precedence (lowest index) match.
510
510
if index < minimum :
511
511
minimum = index
512
- sl_tag = subtype (label )
512
+ sl_tag = _subtype (label )
513
513
514
514
# If a label does match, take the best from that label
515
515
# Further labels from magic are probably terrible
@@ -567,7 +567,7 @@ def _differentiate(lang: str, scores_map: Dict) -> str:
567
567
568
568
569
569
# Pass a filepath and this will return the guessed language in the AL tag format.
570
- def guess_language (path : str ) -> Tuple [str , Union [str , int ]]:
570
+ def _guess_language (path : str ) -> Tuple [str , Union [str , int ]]:
571
571
file_length = os .path .getsize (path )
572
572
with open (path , 'rb' ) as fh :
573
573
if file_length > 131070 :
@@ -709,12 +709,12 @@ def dos_ident(path: str) -> str:
709
709
try :
710
710
with open (path , "rb" ) as fh :
711
711
file_header = fh .read (0x40 )
712
- if file_header [0 :2 ] != "MZ" :
712
+ if file_header [0 :2 ] != b "MZ" :
713
713
raise ValueError ()
714
714
715
715
header_pos , = struct .unpack ("<I" , file_header [- 4 :])
716
716
fh .seek (header_pos )
717
- if fh .read (4 ) != "PE\x00 \x00 " :
717
+ if fh .read (4 ) != b "PE\x00 \x00 " :
718
718
raise ValueError ()
719
719
machine_id , = struct .unpack ("<H" , fh .read (2 ))
720
720
if machine_id == 0x014c :
@@ -749,7 +749,7 @@ def fileinfo(path: str) -> Dict:
749
749
with open (path , 'rb' ) as fh :
750
750
buf = fh .read ()
751
751
buflen = len (buf )
752
- data .update (ident (buf , buflen ))
752
+ data .update (ident (buf , buflen , path ))
753
753
data ['ssdeep' ] = ssdeep_from_file (path ) if ssdeep_from_file else ''
754
754
755
755
# When data is parsed from a cart file we trust its metadata and can skip the recognition test later
@@ -767,7 +767,7 @@ def fileinfo(path: str) -> Dict:
767
767
# but don't commit to it being a zip if it can't be extracted
768
768
data ['type' ] = zip_ident (path , data ['type' ])
769
769
elif data ['type' ] == 'unknown' :
770
- data ['type' ], _ = guess_language (path )
770
+ data ['type' ], _ = _guess_language (path )
771
771
elif data ['type' ] == 'archive/cart' :
772
772
data ['type' ] = cart_ident (path )
773
773
cart_metadata_set = True
@@ -777,7 +777,7 @@ def fileinfo(path: str) -> Dict:
777
777
elif data ['type' ] == 'code/html' :
778
778
# Magic detects .hta files as .html, guess_language detects .hta files as .js/.vbs
779
779
# If both conditions are met, it's fair to say that the file is an .hta
780
- lang , _ = guess_language (path )
780
+ lang , _ = _guess_language (path )
781
781
if lang in ["code/javascript" , "code/vbs" ]:
782
782
data ['type' ] = 'code/hta'
783
783
0 commit comments