refactor: new SequenceExpr and CharStringExt methods (#2269)

* refactor: new `SequenceExpr` and `CharStringExt` methods

* refactor: propagate new methods
This commit is contained in:
Andrew Dunbar
2025-12-02 16:14:33 +00:00
committed by GitHub
parent 30ff6e1f22
commit 70d5861b94
22 changed files with 165 additions and 126 deletions

View File

@@ -46,6 +46,10 @@ pub trait CharStringExt {
/// The suffix is assumed to be lowercase.
fn ends_with_ignore_ascii_case_str(&self, suffix: &str) -> bool;
/// Case-insensitive check if the string ends with any of the given ASCII suffixes.
/// The suffixes are assumed to be lowercase.
fn ends_with_any_ignore_ascii_case_chars(&self, suffixes: &[&[char]]) -> bool;
/// Check if the string contains any vowels
fn contains_vowel(&self) -> bool;
}
@@ -148,6 +152,12 @@ impl CharStringExt for [char] {
.all(|(a, b)| a.to_ascii_lowercase() == *b)
}
/// Reports whether this slice ends with at least one of `suffixes`, using
/// `ends_with_ignore_ascii_case_chars` for each comparison.
///
/// Stops at the first suffix that matches; yields `false` when none match
/// (which includes an empty `suffixes` list).
fn ends_with_any_ignore_ascii_case_chars(&self, suffixes: &[&[char]]) -> bool {
    for candidate in suffixes {
        if self.ends_with_ignore_ascii_case_chars(candidate) {
            return true;
        }
    }
    false
}
/// Reports whether at least one character in this slice satisfies `is_vowel()`.
fn contains_vowel(&self) -> bool {
    for ch in self.iter() {
        if ch.is_vowel() {
            return true;
        }
    }
    false
}

View File

@@ -42,17 +42,13 @@ impl FixedPhrase {
phrase = phrase.then_whitespace();
}
TokenKind::Punctuation(p) => {
phrase = phrase.then(move |t: &Token, _source: &[char]| {
t.kind.as_punctuation().cloned() == Some(p)
})
phrase = phrase
.then_kind_where(move |kind| kind.as_punctuation().cloned() == Some(p));
}
TokenKind::ParagraphBreak => {
phrase = phrase.then_whitespace();
}
TokenKind::Number(n) => {
phrase = phrase
.then(move |tok: &Token, _source: &[char]| tok.kind == TokenKind::Number(n))
}
TokenKind::Number(_) => phrase = phrase.then_kind_where(|kind| kind.is_number()),
_ => panic!("Fell out of expected document formats."),
}
}

View File

@@ -19,8 +19,8 @@ macro_rules! gen_then_from_is {
paste! {
#[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
pub fn [< then_$quality >] (self) -> Self{
self.then(|tok: &Token, _source: &[char]| {
tok.kind.[< is_$quality >]()
self.then_kind_where(|kind| {
kind.[< is_$quality >]()
})
}
@@ -40,12 +40,8 @@ macro_rules! gen_then_from_is {
#[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns false.")]
pub fn [< then_anything_but_$quality >] (self) -> Self{
self.then(|tok: &Token, _source: &[char]| {
if tok.kind.[< is_$quality >](){
false
}else{
true
}
self.then_kind_where(|kind| {
!kind.[< is_$quality >]()
})
}
}
@@ -170,11 +166,6 @@ impl SequenceExpr {
self.then(WordSet::new(words))
}
/// Matches any token whose `Kind` exactly matches.
pub fn then_strict(self, kind: TokenKind) -> Self {
self.then(move |tok: &Token, _source: &[char]| tok.kind == kind)
}
/// Match against one or more whitespace tokens.
pub fn then_whitespace(self) -> Self {
self.then(WhitespacePattern)
@@ -229,7 +220,7 @@ impl SequenceExpr {
/// Matches any word.
pub fn then_any_word(self) -> Self {
self.then(|tok: &Token, _source: &[char]| tok.kind.is_word())
self.then_kind_where(|kind| kind.is_word())
}
/// Match examples of `word` that have any capitalization.
@@ -266,6 +257,23 @@ impl SequenceExpr {
// One kind
/// Adds a step that accepts a token only when its kind compares equal to `kind`.
///
/// Implemented on top of [`Self::then_kind_where`] with an equality predicate.
pub fn then_kind(self, kind: TokenKind) -> Self {
    let same_kind = move |candidate: &TokenKind| kind == *candidate;
    self.then_kind_where(same_kind)
}
/// Appends a step that accepts a token when `predicate` returns true for the
/// token's kind.
///
/// Only `tok.kind` is handed to the predicate; the source characters are not
/// consulted by this step.
pub fn then_kind_where<F>(mut self, predicate: F) -> Self
where
    F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
    // Adapt the kind-only predicate to the (token, source) closure shape
    // that is boxed and stored in `exprs`.
    let kind_step = move |tok: &Token, _source: &[char]| predicate(&tok.kind);
    self.exprs.push(Box::new(kind_step));
    self
}
/// Match a token of a given kind which is not in the list of words.
pub fn then_kind_except<F>(self, pred_is: F, ex: &'static [&'static str]) -> Self
where
@@ -288,7 +296,7 @@ impl SequenceExpr {
F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
self.then(move |tok: &Token, _source: &[char]| pred_is_1(&tok.kind) && pred_is_2(&tok.kind))
self.then_kind_where(move |k| pred_is_1(k) && pred_is_2(k))
}
/// Match a token where either of the two token kind predicates returns true.
@@ -298,7 +306,17 @@ impl SequenceExpr {
F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
self.then(move |tok: &Token, _source: &[char]| pred_is_1(&tok.kind) || pred_is_2(&tok.kind))
self.then_kind_where(move |k| pred_is_1(k) || pred_is_2(k))
}
/// Adds a step that accepts a token only when both kind predicates return false.
/// For example: a word that can be neither a verb nor a noun.
///
/// The second predicate is not evaluated when the first already returns true.
pub fn then_kind_neither<F1, F2>(self, pred_isnt_1: F1, pred_isnt_2: F2) -> Self
where
    F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
    F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
    self.then_kind_where(move |kind| {
        let hits_either = pred_isnt_1(kind) || pred_isnt_2(kind);
        !hits_either
    })
}
/// Match a token where the first token kind predicate returns true and the second returns false.
@@ -308,7 +326,21 @@ impl SequenceExpr {
F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
self.then(move |tok: &Token, _source: &[char]| pred_is(&tok.kind) && !pred_not(&tok.kind))
self.then_kind_where(move |k| pred_is(k) && !pred_not(k))
}
/// Adds a step that accepts a token when `pred_is` returns true for its kind
/// and every predicate in `preds_isnt` returns false.
/// For example: a word that can be a verb, but neither a noun nor an adjective.
///
/// The exclusion list is only consulted once `pred_is` has returned true, and
/// scanning stops at the first excluded predicate that returns true.
pub fn then_kind_is_but_isnt_any_of<F1, F2>(
    self,
    pred_is: F1,
    preds_isnt: &'static [F2],
) -> Self
where
    F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
    F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
    self.then_kind_where(move |kind| {
        pred_is(kind) && preds_isnt.iter().all(|excluded| !excluded(kind))
    })
}
/// Match a token where the first token kind predicate returns true and the second returns false,
@@ -341,7 +373,16 @@ impl SequenceExpr {
where
F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
self.then(move |tok: &Token, _source: &[char]| preds_is.iter().any(|pred| pred(&tok.kind)))
self.then_kind_where(move |k| preds_is.iter().any(|pred| pred(k)))
}
/// Adds a step that accepts a token only when every predicate in `preds_isnt`
/// returns false for its kind.
/// The many-predicate counterpart of `then_kind_neither`.
///
/// Scanning stops at the first predicate that returns true.
pub fn then_kind_none_of<F>(self, preds_isnt: &'static [F]) -> Self
where
    F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
{
    self.then_kind_where(move |kind| {
        let any_hit = preds_isnt.iter().any(|pred| pred(kind));
        !any_hit
    })
}
/// Match a token where any of the token kind predicates returns true,

View File

@@ -12,11 +12,13 @@ pub struct AdjectiveDoubleDegree {
impl Default for AdjectiveDoubleDegree {
fn default() -> Self {
Self {
expr: Box::new(SequenceExpr::word_set(&["more", "most"]).t_ws().then(
|tok: &Token, _src: &[char]| {
tok.kind.is_comparative_adjective() || tok.kind.is_superlative_adjective()
},
)),
expr: Box::new(
SequenceExpr::word_set(&["more", "most"])
.t_ws()
.then_kind_where(|kind| {
kind.is_comparative_adjective() || kind.is_superlative_adjective()
}),
),
}
}
}

View File

@@ -12,7 +12,7 @@ pub struct DeterminerWithoutNoun {
impl Default for DeterminerWithoutNoun {
fn default() -> Self {
let expr = SequenceExpr::default()
.then(|tok: &Token, _: &[char]| tok.kind.is_determiner())
.then_kind_where(|kind| kind.is_determiner())
.t_ws()
.then_conjunction();

View File

@@ -15,22 +15,22 @@ impl Default for Everyday {
let everyday = Word::new("everyday");
let every_day = Lrc::new(SequenceExpr::aco("every").t_ws().t_aco("day"));
let everyday_bad_after =
All::new(vec![
Box::new(
SequenceExpr::default()
.then(everyday.clone())
.t_ws()
.then_any_word(),
),
Box::new(SequenceExpr::default().t_any().t_any().then(
|tok: &Token, _src: &[char]| {
!tok.kind.is_noun()
&& !tok.kind.is_oov()
&& !tok.kind.is_verb_progressive_form()
},
)),
]);
let everyday_bad_after = All::new(vec![
Box::new(
SequenceExpr::default()
.then(everyday.clone())
.t_ws()
.then_any_word(),
),
Box::new(
SequenceExpr::default()
.t_any()
.t_any()
.then_kind_where(|kind| {
!kind.is_noun() && !kind.is_oov() && !kind.is_verb_progressive_form()
}),
),
]);
let bad_before_every_day = All::new(vec![
Box::new(
@@ -72,18 +72,14 @@ impl Default for Everyday {
.then(everyday.clone())
.then_punctuation(),
),
Box::new(
SequenceExpr::default()
.t_any()
.then(|tok: &Token, _src: &[char]| {
matches!(
tok.kind,
TokenKind::Punctuation(
Punctuation::Question | Punctuation::Comma | Punctuation::Period
)
)
}),
),
Box::new(SequenceExpr::default().t_any().then_kind_where(|kind| {
matches!(
kind,
TokenKind::Punctuation(
Punctuation::Question | Punctuation::Comma | Punctuation::Period
)
)
})),
]);
// (However, the message goes far beyond) every day things.
@@ -102,9 +98,9 @@ impl Default for Everyday {
.t_any()
.t_any()
.t_any()
.then(|tok: &Token, _src: &[char]| {
.then_kind_where(|kind| {
matches!(
tok.kind,
kind,
TokenKind::Punctuation(
Punctuation::Question | Punctuation::Comma | Punctuation::Period
)

View File

@@ -18,9 +18,7 @@ impl Default for IAmAgreement {
.then(i_are.clone());
let non_and_word_before_i_are = SequenceExpr::default()
.then(|tok: &Token, src: &[char]| {
!tok.kind.is_word() || tok.span.get_content_string(src).to_lowercase() != "and"
})
.then_word_except(&["and"])
.t_ws()
.then(i_are);

View File

@@ -15,23 +15,23 @@ impl Default for ItLooksLikeThat {
SequenceExpr::default()
.then_fixed_phrase("it looks like that")
.then_whitespace()
.then(|tok: &Token, _: &[char]| {
.then_kind_where(|kind| {
// Heuristics on the word after "that" which show "that" was used
// as a relative pronoun, which is a mistake
let is_subj = tok.kind.is_subject_pronoun();
let is_ing = tok.kind.is_verb_progressive_form();
let is_subj = kind.is_subject_pronoun();
let is_ing = kind.is_verb_progressive_form();
let is_definitely_rel_pron = is_subj || is_ing;
// Heuristics on the word after "that" which show "that"
// could possibly be a legitimate demonstrative pronoun or determiner
// as a demonstrative pronoun or a determiner
// which would not be a mistake.
let is_v3psgpres = tok.kind.is_verb_third_person_singular_present_form();
let is_v3psgpres = kind.is_verb_third_person_singular_present_form();
// NOTE: we don't have .is_modal_verb() but maybe we need it now!
let is_vmodal_or_aux = tok.kind.is_auxiliary_verb();
let is_vpret = tok.kind.is_verb_simple_past_form();
let is_noun = tok.kind.is_noun();
let is_oov = tok.kind.is_oov();
let is_vmodal_or_aux = kind.is_auxiliary_verb();
let is_vpret = kind.is_verb_simple_past_form();
let is_noun = kind.is_noun();
let is_oov = kind.is_oov();
let maybe_demonstrative_or_determiner =
is_v3psgpres || is_vmodal_or_aux || is_vpret || is_noun || is_oov;

View File

@@ -27,13 +27,14 @@ impl Default for NoContractionWithVerb {
.then(WordSet::new(&["lets", "let"]))
.then_whitespace();
// Match verbs that are only verbs (not also nouns/adjectives) and not in -ing form
let non_ing_verb = SequenceExpr::default().then(|tok: &Token, _src: &[char]| {
tok.kind.is_verb()
&& !tok.kind.is_noun()
&& !tok.kind.is_adjective()
&& !tok.kind.is_verb_progressive_form()
});
let non_ing_verb = SequenceExpr::default().then_kind_is_but_isnt_any_of(
TokenKind::is_verb,
&[
TokenKind::is_noun,
TokenKind::is_adjective,
TokenKind::is_verb_progressive_form,
] as &[_],
);
// Ambiguous word is a verb determined by heuristic of following word's part of speech
// Tests the next two words after "let".

View File

@@ -19,9 +19,10 @@ where
{
pub fn new(dict: D) -> Self {
let oov = SequenceExpr::default().then_oov();
let looks_plural = SequenceExpr::default().then(|tok: &Token, _src: &[char]| {
let lchars = tok.span.get_content(_src).to_lower();
lchars.last().is_some_and(|c| *c == 's')
let looks_plural = SequenceExpr::default().then(|tok: &Token, src: &[char]| {
tok.span
.get_content(src)
.ends_with_ignore_ascii_case_chars(&['s'])
});
let oov_looks_plural = All::new(vec![Box::new(oov), Box::new(looks_plural)]);

View File

@@ -203,7 +203,7 @@ impl Default for MissingTo {
let pattern = SequenceExpr::default()
.then(Self::controller_words())
.t_ws()
.then(|tok: &Token, _source: &[char]| tok.kind.is_verb_lemma());
.then_kind_where(|kind| kind.is_verb_lemma());
map.insert(pattern, 0);

View File

@@ -29,15 +29,15 @@ impl ModalSeem {
fn adjective_step() -> SequenceExpr {
SequenceExpr::default()
.t_ws()
.then(|tok: &Token, _source: &[char]| tok.kind.is_adjective())
.then_kind_where(|kind| kind.is_adjective())
}
fn adverb_then_adjective_step() -> SequenceExpr {
SequenceExpr::default()
.t_ws()
.then(|tok: &Token, _source: &[char]| tok.kind.is_adverb())
.then_kind_where(|kind| kind.is_adverb())
.t_ws()
.then(|tok: &Token, _source: &[char]| tok.kind.is_adjective())
.then_kind_where(|kind| kind.is_adjective())
}
}

View File

@@ -54,8 +54,8 @@ impl Default for Months {
"by", "during", "in", "last", "next", "of", "until",
]);
let year_or_day_of_month = SequenceExpr::default().then(|tok: &Token, _src: &[char]| {
if let TokenKind::Number(number) = &tok.kind {
let year_or_day_of_month = SequenceExpr::default().then_kind_where(|kind| {
if let TokenKind::Number(number) = &kind {
let v = number.value.into_inner() as u32;
(1500..=2500).contains(&v) || (1..=31).contains(&v)
} else {

View File

@@ -30,12 +30,12 @@ impl Default for NeedToNoun {
.then_word_set(&["be"]);
let a = SequenceExpr::default()
.then(|tok: &Token, _: &[char]| tok.kind.is_nominal())
.then_kind_where(|kind| kind.is_nominal())
.t_ws()
.then_unless(postfix_exceptions);
let b = SequenceExpr::default()
.then(|tok: &Token, _: &[char]| tok.kind.is_nominal() && !tok.kind.is_verb());
let b =
SequenceExpr::default().then_kind_where(|kind| kind.is_nominal() && !kind.is_verb());
let expr = SequenceExpr::default()
.then(DerivedFrom::new_from_str("need"))

View File

@@ -57,7 +57,7 @@ impl Default for AffectToEffect {
.then(|tok: &Token, source: &[char]| matches_preceding_context(tok, source))
.t_ws()
.then(|tok: &Token, source: &[char]| is_affect_word(tok, source))
.then(|tok: &Token, _source: &[char]| matches!(tok.kind, TokenKind::Punctuation(_)));
.then_kind_where(|kind| kind.is_punctuation());
map.insert(punctuation_follow, 2);

View File

@@ -1,3 +1,4 @@
use crate::TokenKind;
use crate::expr::Expr;
use crate::expr::SequenceExpr;
use crate::{CharString, CharStringExt, Token, char_string::char_string, patterns::WordSet};
@@ -16,9 +17,10 @@ impl Default for PiqueInterest {
"peak", "peaked", "peek", "peeked", "peeking", "peaking",
]))
.then_whitespace()
.then(|tok: &Token, _: &[char]| {
tok.kind.is_non_plural_nominal() || tok.kind.is_possessive_determiner()
})
.then_kind_either(
TokenKind::is_non_plural_nominal,
TokenKind::is_possessive_determiner,
)
.then_whitespace()
.t_aco("interest");

View File

@@ -13,12 +13,12 @@ pub struct PronounAre {
impl Default for PronounAre {
fn default() -> Self {
let expr = SequenceExpr::default()
.then(|tok: &Token, _src: &[char]| {
tok.kind.is_pronoun()
&& tok.kind.is_subject_pronoun()
&& (tok.kind.is_second_person_pronoun()
|| tok.kind.is_first_person_plural_pronoun()
|| tok.kind.is_third_person_plural_pronoun())
.then_kind_where(|kind| {
kind.is_pronoun()
&& kind.is_subject_pronoun()
&& (kind.is_second_person_pronoun()
|| kind.is_first_person_plural_pronoun()
|| kind.is_third_person_plural_pronoun())
})
.t_ws()
.t_aco("r");

View File

@@ -52,9 +52,8 @@ impl PronounInflectionBe {
map.insert(arent, "isn't");
let is = SequenceExpr::default()
.then(|tok: &Token, _: &[char]| {
tok.kind
.as_word()
.then_kind_where(|kind| {
kind.as_word()
.as_ref()
.and_then(|m| m.as_ref().and_then(|m| m.np_member))
.unwrap_or_default()

View File

@@ -27,9 +27,9 @@ impl Default for QuiteQuiet {
if !tok.kind.is_verb() || !tok.kind.is_apostrophized() {
return false;
}
let chars = tok.span.get_content(src);
chars.ends_with_ignore_ascii_case_chars(&['n', '\'', 't'])
|| chars.ends_with_ignore_ascii_case_chars(&['n', '', 't'])
tok.span
.get_content(src)
.ends_with_any_ignore_ascii_case_chars(&[&['n', '\'', 't'], &['n', '', 't']])
})
.t_ws()
.t_aco("quiet");

View File

@@ -1,6 +1,6 @@
use crate::linting::expr_linter::Chunk;
use crate::{
CharStringExt, Token,
Token, TokenKind,
expr::SequenceExpr,
linting::{ExprLinter, Lint, LintKind, Suggestion},
};
@@ -11,14 +11,10 @@ pub struct Theres {
impl Default for Theres {
fn default() -> Self {
let expr = SequenceExpr::aco("their's")
.t_ws()
.then(|tok: &Token, src: &[char]| {
tok.kind.is_determiner()
|| tok.kind.is_quantifier()
|| tok.span.get_content(src).eq_ignore_ascii_case_str("no")
|| tok.span.get_content(src).eq_ignore_ascii_case_str("enough")
});
let expr = SequenceExpr::aco("their's").t_ws().then_kind_any_or_words(
&[TokenKind::is_determiner, TokenKind::is_quantifier] as &[_],
&["no", "enough"],
);
Self {
expr: Box::new(expr),

View File

@@ -23,10 +23,7 @@ impl Default for ToTooAdjVerbEdPunct {
&& tok
.span
.get_content(src)
.iter()
.collect::<String>()
.to_lowercase()
.ends_with("ed")
.ends_with_ignore_ascii_case_chars(&['e', 'd'])
})
.then_sentence_terminator();

View File

@@ -19,12 +19,12 @@ impl Default for VerbToAdjective {
let expr = SequenceExpr::default()
.then(WordSet::new(&["the", "a", "an"]))
.t_ws()
.then(|tok: &Token, _: &[char]| {
(tok.kind.is_verb()
&& !tok.kind.is_verb_past_form()
&& !tok.kind.is_adjective()
&& !tok.kind.is_noun())
|| tok.kind.is_degree_adverb()
.then_kind_where(|kind| {
(kind.is_verb()
&& !kind.is_verb_past_form()
&& !kind.is_adjective()
&& !kind.is_noun())
|| kind.is_degree_adverb()
})
.t_ws()
.then(UPOSSet::new(&[UPOS::NOUN, UPOS::PROPN]));