Skip to content

Commit 107308a

Browse files
committed
perf: Use standard identifier rules to avoid doing number checks
1 parent ed00f3c commit 107308a

File tree

3 files changed

+88
-20
lines changed

3 files changed

+88
-20
lines changed

src/config.rs

+40-3
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,27 @@ pub trait FileSource {
5858
None
5959
}
6060

61-
/// Do not check identifiers that appear to be hexadecimal values
61+
/// Do not check identifiers that appear to be hexadecimal values.
6262
fn ignore_hex(&self) -> Option<bool> {
6363
None
6464
}
6565

66-
/// Allow identifiers to include digits, in addition to letters
66+
/// Allow identifiers to start with digits, in addition to letters.
67+
fn identifier_leading_digits(&self) -> Option<bool> {
68+
None
69+
}
70+
71+
/// Allow identifiers to start with one of these characters.
72+
fn identifier_leading_chars(&self) -> Option<&str> {
73+
None
74+
}
75+
76+
/// Allow identifiers to include digits, in addition to letters.
6777
fn identifier_include_digits(&self) -> Option<bool> {
6878
None
6979
}
7080

71-
/// Specify additional characters to be included in identifiers
81+
/// Allow identifiers to include these characters.
7282
fn identifier_include_chars(&self) -> Option<&str> {
7383
None
7484
}
@@ -233,6 +243,8 @@ pub struct FileConfig {
233243
pub check_filename: Option<bool>,
234244
pub check_file: Option<bool>,
235245
pub ignore_hex: Option<bool>,
246+
pub identifier_leading_digits: Option<bool>,
247+
pub identifier_leading_chars: Option<String>,
236248
pub identifier_include_digits: Option<bool>,
237249
pub identifier_include_chars: Option<String>,
238250
}
@@ -248,6 +260,12 @@ impl FileConfig {
248260
if let Some(source) = source.ignore_hex() {
249261
self.ignore_hex = Some(source);
250262
}
263+
if let Some(source) = source.identifier_leading_digits() {
264+
self.identifier_leading_digits = Some(source);
265+
}
266+
if let Some(source) = source.identifier_leading_chars() {
267+
self.identifier_leading_chars = Some(source.to_owned());
268+
}
251269
if let Some(source) = source.identifier_include_digits() {
252270
self.identifier_include_digits = Some(source);
253271
}
@@ -268,6 +286,17 @@ impl FileConfig {
268286
self.ignore_hex.unwrap_or(true)
269287
}
270288

289+
pub fn identifier_leading_digits(&self) -> bool {
290+
self.identifier_leading_digits.unwrap_or(false)
291+
}
292+
293+
pub fn identifier_leading_chars(&self) -> &str {
294+
self.identifier_leading_chars
295+
.as_ref()
296+
.map(|s| s.as_str())
297+
.unwrap_or("_")
298+
}
299+
271300
pub fn identifier_include_digits(&self) -> bool {
272301
self.identifier_include_digits.unwrap_or(true)
273302
}
@@ -293,6 +322,14 @@ impl FileSource for FileConfig {
293322
self.ignore_hex
294323
}
295324

325+
fn identifier_leading_digits(&self) -> Option<bool> {
326+
self.identifier_leading_digits
327+
}
328+
329+
fn identifier_leading_chars(&self) -> Option<&str> {
330+
self.identifier_leading_chars.as_ref().map(|s| s.as_str())
331+
}
332+
296333
fn identifier_include_digits(&self) -> Option<bool> {
297334
self.identifier_include_digits
298335
}

src/main.rs

+2
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {
412412

413413
let parser = typos::tokens::ParserBuilder::new()
414414
.ignore_hex(config.default.ignore_hex())
415+
.leading_digits(config.default.identifier_leading_digits())
416+
.leading_chars(config.default.identifier_leading_chars().to_owned())
415417
.include_digits(config.default.identifier_include_digits())
416418
.include_chars(config.default.identifier_include_chars().to_owned())
417419
.build();

typos/src/tokens.rs

+46-17
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ pub enum Case {
99
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
1010
pub struct ParserBuilder {
1111
ignore_hex: bool,
12+
leading_digits: bool,
13+
leading_chars: String,
1214
include_digits: bool,
1315
include_chars: String,
1416
}
@@ -23,6 +25,16 @@ impl ParserBuilder {
2325
self
2426
}
2527

28+
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
29+
self.leading_digits = yes;
30+
self
31+
}
32+
33+
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
34+
self.leading_chars = chars;
35+
self
36+
}
37+
2638
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
2739
self.include_digits = yes;
2840
self
@@ -34,31 +46,44 @@ impl ParserBuilder {
3446
}
3547

3648
pub fn build(&self) -> Parser {
37-
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
38-
if self.include_digits {
39-
pattern.push_str(r#"|\d"#);
40-
}
41-
for grapheme in
42-
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
43-
{
44-
let escaped = regex::escape(&grapheme);
45-
pattern.push_str(&format!("|{}", escaped));
46-
}
47-
pattern.push_str(r#")+\b"#);
49+
let mut pattern = r#"\b("#.to_owned();
50+
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
51+
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
52+
pattern.push_str(r#"*)\b"#);
53+
let pattern = dbg!(pattern);
54+
4855
let words_str = regex::Regex::new(&pattern).unwrap();
4956
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
57+
5058
Parser {
5159
words_str,
5260
words_bytes,
53-
ignore_hex: self.ignore_hex && self.include_digits,
61+
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
62+
// hexadecimal number without a leading digit.
63+
ignore_numbers: self.leading_digits,
64+
ignore_hex: self.ignore_hex && self.leading_digits,
5465
}
5566
}
67+
68+
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
69+
pattern.push_str(r#"(\p{Alphabetic}"#);
70+
if digits {
71+
pattern.push_str(r#"|\d"#);
72+
}
73+
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
74+
let escaped = regex::escape(&grapheme);
75+
pattern.push_str(&format!("|{}", escaped));
76+
}
77+
pattern.push_str(r#")"#);
78+
}
5679
}
5780

5881
impl Default for ParserBuilder {
5982
fn default() -> Self {
6083
Self {
6184
ignore_hex: true,
85+
leading_digits: false,
86+
leading_chars: "_".to_owned(),
6287
include_digits: true,
6388
include_chars: "_'".to_owned(),
6489
}
@@ -69,6 +94,7 @@ impl Default for ParserBuilder {
6994
pub struct Parser {
7095
words_str: regex::Regex,
7196
words_bytes: regex::bytes::Regex,
97+
ignore_numbers: bool,
7298
ignore_hex: bool,
7399
}
74100

@@ -95,12 +121,12 @@ impl Parser {
95121
}
96122

97123
fn accept(&self, contents: &[u8]) -> bool {
98-
if is_number(contents) {
124+
if self.ignore_numbers && is_number(contents) {
99125
return false;
100-
};
126+
}
101127

102-
if self.ignore_hex {
103-
return !is_hex(contents);
128+
if self.ignore_hex && is_hex(contents) {
129+
return false;
104130
}
105131

106132
true
@@ -455,7 +481,10 @@ mod test {
455481

456482
#[test]
457483
fn tokenize_ignore_hex_disabled() {
458-
let parser = ParserBuilder::new().ignore_hex(false).build();
484+
let parser = ParserBuilder::new()
485+
.ignore_hex(false)
486+
.leading_digits(true)
487+
.build();
459488

460489
let input = "Hello 0xDEADBEEF World";
461490
let expected: Vec<Identifier> = vec![

0 commit comments

Comments
 (0)