@@ -9,6 +9,8 @@ pub enum Case {
9
9
/// Configuration for the word pattern that `build` compiles into a `Parser`.
///
/// A word is one leading character followed by zero or more continuation characters; both
/// classes always accept alphabetic characters and are widened by the fields below.
#[ derive( Debug , Clone , PartialEq , Eq , Hash ) ]
10
10
pub struct ParserBuilder {
11
11
/// Requests hex filtering; `build` only enables it on the parser when `leading_digits` is
/// also set.
ignore_hex : bool ,
12
/// Whether `\d` is accepted as the leading character of a word.
+ leading_digits : bool ,
13
/// Extra graphemes accepted as the leading character of a word.
+ leading_chars : String ,
12
14
/// Whether `\d` is accepted in the continuation characters of a word.
include_digits : bool ,
13
15
/// Extra graphemes accepted in the continuation characters of a word.
include_chars : String ,
14
16
}
@@ -23,6 +25,16 @@ impl ParserBuilder {
23
25
self
24
26
}
25
27
28
+ pub fn leading_digits ( & mut self , yes : bool ) -> & mut Self {
29
+ self . leading_digits = yes;
30
+ self
31
+ }
32
+
33
+ pub fn leading_chars ( & mut self , chars : String ) -> & mut Self {
34
+ self . leading_chars = chars;
35
+ self
36
+ }
37
+
26
38
pub fn include_digits ( & mut self , yes : bool ) -> & mut Self {
27
39
self . include_digits = yes;
28
40
self
@@ -34,31 +46,44 @@ impl ParserBuilder {
34
46
}
35
47
36
48
pub fn build ( & self ) -> Parser {
37
- let mut pattern = r#"\b(\p{Alphabetic}"# . to_owned ( ) ;
38
- if self . include_digits {
39
- pattern. push_str ( r#"|\d"# ) ;
40
- }
41
- for grapheme in
42
- unicode_segmentation:: UnicodeSegmentation :: graphemes ( self . include_chars . as_str ( ) , true )
43
- {
44
- let escaped = regex:: escape ( & grapheme) ;
45
- pattern. push_str ( & format ! ( "|{}" , escaped) ) ;
46
- }
47
- pattern. push_str ( r#")+\b"# ) ;
49
+ let mut pattern = r#"\b("# . to_owned ( ) ;
50
+ Self :: push_pattern ( & mut pattern, self . leading_digits , & self . leading_chars ) ;
51
+ Self :: push_pattern ( & mut pattern, self . include_digits , & self . include_chars ) ;
52
+ pattern. push_str ( r#"*)\b"# ) ;
53
+ let pattern = dbg ! ( pattern) ;
54
+
48
55
let words_str = regex:: Regex :: new ( & pattern) . unwrap ( ) ;
49
56
let words_bytes = regex:: bytes:: Regex :: new ( & pattern) . unwrap ( ) ;
57
+
50
58
Parser {
51
59
words_str,
52
60
words_bytes,
53
- ignore_hex : self . ignore_hex && self . include_digits ,
61
+ // `leading_digits` let's us bypass the regexes since you can't have a decimal or
62
+ // hexadecimal number without a leading digit.
63
+ ignore_numbers : self . leading_digits ,
64
+ ignore_hex : self . ignore_hex && self . leading_digits ,
54
65
}
55
66
}
67
+
68
+ fn push_pattern ( pattern : & mut String , digits : bool , chars : & str ) {
69
+ pattern. push_str ( r#"(\p{Alphabetic}"# ) ;
70
+ if digits {
71
+ pattern. push_str ( r#"|\d"# ) ;
72
+ }
73
+ for grapheme in unicode_segmentation:: UnicodeSegmentation :: graphemes ( chars, true ) {
74
+ let escaped = regex:: escape ( & grapheme) ;
75
+ pattern. push_str ( & format ! ( "|{}" , escaped) ) ;
76
+ }
77
+ pattern. push_str ( r#")"# ) ;
78
+ }
56
79
}
57
80
58
81
impl Default for ParserBuilder {
59
82
fn default ( ) -> Self {
60
83
Self {
61
84
ignore_hex : true ,
85
+ leading_digits : false ,
86
+ leading_chars : "_" . to_owned ( ) ,
62
87
include_digits : true ,
63
88
include_chars : "_'" . to_owned ( ) ,
64
89
}
@@ -69,6 +94,7 @@ impl Default for ParserBuilder {
69
94
/// Holds the compiled word patterns and filter flags produced by `ParserBuilder::build`.
pub struct Parser {
70
95
/// Word pattern compiled for `str` input.
words_str : regex:: Regex ,
71
96
/// The same word pattern compiled for byte-slice input.
words_bytes : regex:: bytes:: Regex ,
97
/// When set, `accept` rejects contents for which `is_number` returns true.
+ ignore_numbers : bool ,
72
98
/// When set, `accept` rejects contents for which `is_hex` returns true.
ignore_hex : bool ,
73
99
}
74
100
@@ -95,12 +121,12 @@ impl Parser {
95
121
}
96
122
97
123
fn accept ( & self , contents : & [ u8 ] ) -> bool {
98
- if is_number ( contents) {
124
+ if self . ignore_numbers && is_number ( contents) {
99
125
return false ;
100
- } ;
126
+ }
101
127
102
- if self . ignore_hex {
103
- return ! is_hex ( contents ) ;
128
+ if self . ignore_hex && is_hex ( contents ) {
129
+ return false ;
104
130
}
105
131
106
132
true
@@ -455,7 +481,10 @@ mod test {
455
481
456
482
#[ test]
457
483
fn tokenize_ignore_hex_disabled ( ) {
458
- let parser = ParserBuilder :: new ( ) . ignore_hex ( false ) . build ( ) ;
484
+ let parser = ParserBuilder :: new ( )
485
+ . ignore_hex ( false )
486
+ . leading_digits ( true )
487
+ . build ( ) ;
459
488
460
489
let input = "Hello 0xDEADBEEF World" ;
461
490
let expected: Vec < Identifier > = vec ! [
0 commit comments