feat(chunker): build new chunker with Burn (#1579)

Elijah Potter
2025-07-31 11:49:46 -06:00
committed by GitHub
parent bdc6b1b16b
commit 7f10ac6055
37 changed files with 45571 additions and 9374 deletions

Cargo.lock (generated): 2731 lines changed

File diff suppressed because it is too large

View File

@@ -18,4 +18,4 @@ opt-level = 3
# Useful for debugging and profiling.
[profile.release-debug]
inherits = "release"
debug = 2
debug = 2

View File

@@ -11,7 +11,7 @@ RUN cargo install wasm-pack
COPY . .
WORKDIR /usr/build/harper-wasm
RUN wasm-pack build --release --target web
RUN RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build --target web
FROM node:${NODE_VERSION} AS node-build

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -1,4 +1,7 @@
use harper_pos_utils::{BurnChunkerCpu, CachedChunker};
use lazy_static::lazy_static;
use std::num::NonZero;
use std::rc::Rc;
use std::sync::Arc;
pub use harper_pos_utils::{BrillChunker, BrillTagger, Chunker, FreqDict, Tagger, UPOS};
@@ -30,3 +33,21 @@ fn uncached_brill_chunker() -> BrillChunker {
pub fn brill_chunker() -> Arc<BrillChunker> {
(*BRILL_CHUNKER).clone()
}
const BURN_CHUNKER_VOCAB: &[u8; 627993] = include_bytes!("../finished_chunker/vocab.json");
const BURN_CHUNKER_BIN: &[u8; 806312] = include_bytes!("../finished_chunker/model.mpk");
thread_local! {
static BURN_CHUNKER: Rc<CachedChunker<BurnChunkerCpu>> = Rc::new(uncached_burn_chunker());
}
fn uncached_burn_chunker() -> CachedChunker<BurnChunkerCpu> {
CachedChunker::new(
BurnChunkerCpu::load_from_bytes_cpu(BURN_CHUNKER_BIN, BURN_CHUNKER_VOCAB, 6, 0.3),
NonZero::new(10000).unwrap(),
)
}
pub fn burn_chunker() -> Rc<CachedChunker<BurnChunkerCpu>> {
(BURN_CHUNKER).with(|c| c.clone())
}
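
A rough usage sketch of the new accessor (not part of the diff; it assumes the tag_sentence/chunk_sentence signatures that harper-core's Document relies on later in this commit):

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};

// Returns one flag per token: true if the token belongs to a noun phrase.
fn noun_phrase_flags(words: &[String]) -> Vec<bool> {
    // UPOS-tag the tokens first; the chunker consumes the tokens plus their tags.
    let tags = brill_tagger().tag_sentence(words);
    burn_chunker().chunk_sentence(words, &tags)
}

Because the Burn model lives in a thread_local behind an Rc, each call to burn_chunker() is a cheap pointer clone, and the cache inside CachedChunker (capacity 10000 above) is shared across calls on the same thread.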

View File

@@ -24,3 +24,4 @@ strum_macros = "0.27.2"
[features]
default = []
training = ["harper-pos-utils/training"]

View File

@@ -21,7 +21,9 @@ use harper_core::{
word_metadata_orthography::OrthFlags,
};
use harper_literate_haskell::LiterateHaskellParser;
use harper_pos_utils::{BrillChunker, BrillTagger};
#[cfg(feature = "training")]
use harper_pos_utils::{BrillChunker, BrillTagger, BurnChunkerCpu};
use harper_stats::Stats;
use serde::Serialize;
@@ -101,6 +103,7 @@ enum Args {
/// The document to mine words from.
file: PathBuf,
},
#[cfg(feature = "training")]
TrainBrillTagger {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
@@ -112,6 +115,7 @@ enum Args {
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
#[cfg(feature = "training")]
TrainBrillChunker {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
@@ -123,6 +127,27 @@ enum Args {
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
#[cfg(feature = "training")]
TrainBurnChunker {
#[arg(short, long)]
lr: f64,
// The number of embedding dimensions
#[arg(long)]
dim: usize,
/// The path to write the final model file to.
#[arg(short, long)]
output: PathBuf,
/// The number of epochs to train.
#[arg(short, long)]
epochs: usize,
/// The dropout probability
#[arg(long)]
dropout: f32,
#[arg(short, long)]
test_file: PathBuf,
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
/// Print harper-core version.
CoreVersion,
/// Rename a flag in the dictionary and affixes.
@@ -476,6 +501,7 @@ fn main() -> anyhow::Result<()> {
println!("harper-core v{}", harper_core::core_version());
Ok(())
}
#[cfg(feature = "training")]
Args::TrainBrillTagger {
datasets: dataset,
epochs,
@@ -487,6 +513,7 @@ fn main() -> anyhow::Result<()> {
Ok(())
}
#[cfg(feature = "training")]
Args::TrainBrillChunker {
datasets,
epochs,
@@ -497,6 +524,22 @@ fn main() -> anyhow::Result<()> {
fs::write(output, serde_json::to_string_pretty(&chunker)?)?;
Ok(())
}
#[cfg(feature = "training")]
Args::TrainBurnChunker {
datasets,
test_file,
epochs,
dropout,
output,
lr,
dim: embed_dim,
} => {
let chunker =
BurnChunkerCpu::train_cpu(&datasets, &test_file, embed_dim, dropout, epochs, lr);
chunker.save_to(output);
Ok(())
}
Args::RenameFlag { old, new, dir } => {
use serde_json::Value;
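
One detail worth noting: the #[cfg(feature = "training")] attribute has to sit on both the clap variant and its match arm, otherwise a default (non-training) build fails to compile. A standalone sketch of that pattern, illustrative only and not code from this commit:

enum Cmd {
    Version,
    #[cfg(feature = "training")]
    Train { epochs: usize },
}

fn run(cmd: Cmd) {
    match cmd {
        Cmd::Version => println!("version"),
        // Gate the arm too: without the feature, the Train variant does not exist.
        #[cfg(feature = "training")]
        Cmd::Train { epochs } => println!("training for {epochs} epochs"),
    }
}

fn main() {
    run(Cmd::Version);
}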

View File

@@ -2,7 +2,7 @@ use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;
use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;
use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
@@ -140,33 +140,37 @@ impl Document {
self.condense_filename_extensions();
self.match_quotes();
let token_strings: Vec<_> = self
.tokens
.iter()
.filter(|t| !t.kind.is_whitespace())
.map(|t| self.get_span_content_str(&t.span))
.collect();
let chunker = burn_chunker();
let tagger = brill_tagger();
let token_tags = brill_tagger().tag_sentence(&token_strings);
let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);
for sent in self.tokens.iter_sentences_mut() {
let token_strings: Vec<_> = sent
.iter()
.filter(|t| !t.kind.is_whitespace())
.map(|t| t.span.get_content_string(&self.source))
.collect();
let mut i = 0;
let token_tags = tagger.tag_sentence(&token_strings);
let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);
// Annotate word metadata
for token in self.tokens.iter_mut() {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
let mut i = 0;
if let Some(inner) = &mut found_meta {
inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
inner.np_member = Some(np_flags[i]);
// Annotate word metadata
for token in sent.iter_mut() {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
if let Some(inner) = &mut found_meta {
inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
inner.np_member = Some(np_flags[i]);
}
*meta = found_meta;
i += 1;
} else if !token.kind.is_whitespace() {
i += 1;
}
*meta = found_meta;
i += 1;
} else if !token.kind.is_whitespace() {
i += 1;
}
}
}
@@ -745,6 +749,10 @@ impl TokenStringExt for Document {
fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
self.tokens.iter_sentences()
}
fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
self.tokens.iter_sentences_mut()
}
}
impl Display for Document {

View File

@@ -99,6 +99,10 @@ pub trait TokenStringExt {
/// Get an iterator over token slices that represent the individual
/// sentences in a document.
fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
/// Get an iterator over mutable token slices that represent the individual
/// sentences in a document.
fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
}
impl TokenStringExt for [Token] {
@@ -239,4 +243,32 @@ impl TokenStringExt for [Token] {
first_sentence.into_iter().chain(rest).chain(last_sentence)
}
fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
struct SentIter<'a> {
rem: &'a mut [Token],
}
impl<'a> Iterator for SentIter<'a> {
type Item = &'a mut [Token];
fn next(&mut self) -> Option<Self::Item> {
if self.rem.is_empty() {
return None;
}
let split = self
.rem
.iter()
.position(|t| t.kind.is_sentence_terminator())
.map(|i| i + 1)
.unwrap_or(self.rem.len());
let tmp = core::mem::take(&mut self.rem);
let (sent, rest) = tmp.split_at_mut(split);
self.rem = rest;
Some(sent)
}
}
SentIter { rem: self }
}
}
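
The iterator above is the standard trick for handing out disjoint &mut slices from a single mutable borrow: mem::take swaps the remainder out so split_at_mut can consume it by value. The same pattern in a self-contained form (illustrative, not part of the diff):

fn split_after_mut<'a, T>(mut rem: &'a mut [T], is_end: impl Fn(&T) -> bool) -> Vec<&'a mut [T]> {
    let mut out = Vec::new();
    while !rem.is_empty() {
        // Split just after the first element matching the sentinel, or take everything.
        let split = rem
            .iter()
            .position(&is_end)
            .map(|i| i + 1)
            .unwrap_or(rem.len());
        let (chunk, rest) = core::mem::take(&mut rem).split_at_mut(split);
        out.push(chunk);
        rem = rest;
    }
    out
}

fn main() {
    let mut data = [1, 2, 0, 3, 0, 4];
    for chunk in split_after_mut(&mut data, |&x| x == 0) {
        chunk.reverse(); // each chunk is an independent &mut slice
    }
    assert_eq!(data, [0, 2, 1, 0, 3, 4]);
}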

View File

@@ -4,7 +4,7 @@ use itertools::Itertools;
use paste::paste;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use strum::{EnumCount, VariantArray};
use strum::{EnumCount as _, VariantArray as _};
use strum_macros::{Display, EnumCount, EnumString, VariantArray};
use std::convert::TryFrom;

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -6,68 +6,68 @@
# Unlintable Unlintable
> -->
# Unlintable Unlintable
> Part - of - speech tagging
# Unlintable NSg/V/J . P . N🅪Sg/V+ NSg/V
> Part - of - speech tagging
# Unlintable NSg/V/J+ . P . N🅪Sg/V+ NSg/V
>
#
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPr/J/P NSg+ NᴹSg . NSg/V/J . P . N🅪Sg/V NSg/V . NSg+ NSg/V NPr/C NSg+ NSg/V NPr/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPr🅪/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V NPr/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# N🅪Sg/V . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P N🅪Sg/V+ . V/J J/P I/C/Dq ISg/D$+
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPr/J/P NSg+ NᴹSg+ . NSg/V/J+ . P . N🅪Sg/V+ NSg/V . NSg+ NSg/V NPr/C NSg+ NSg/V NPr/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPr🅪/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V+ NPr/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# N🅪Sg/V+ . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P N🅪Sg/V+ . V/J J/P I/C/Dq ISg/D$+
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D$+ N🅪Sg/V+ . D/P V/J NSg/V P I/Ddem+ VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . N🅪Sg/V NPl . NPr/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . N🅪Sg/V+ NPl+ . NPr/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> adverbs , etc.
# NPl/V . W?
# NPl/V . +
>
#
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPr/V/J/C NSg/V/J NPr/J/P D N🅪Sg/V P J+
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPr/V/J/C NSg/V/J NPr/J/P D N🅪Sg/V P J
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NᴹSg+ . V NPl+ I/C+ NSg/V/J+ J NPl/V+ . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P N🅪Sg/V+ . NSg/J/P D/P NPr/V/J P NSg/J+ NPl/V+ . NSg+ . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J+ V/C+ J+ . ? ? NSg . NSg/I/V/J P D
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P N🅪Sg/V+ . NSg/J/P D/P NPr/V/J P NSg/J NPl/V+ . NSg+ . NSg/V NPl+ NSg/V+ P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J V/C J . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/V/J V/C NSg/I/J/Dq R V/J NPr🅪/V/J+ NSg+ . NPl . NPl/V NSg/V+ . V/J NPl+ .
>
#
> Principle
# N🅪Sg/V
# N🅪Sg/V+
>
#
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . N🅪Sg/V NSg/V VL JC C/P V/J V D/P NSg/V P NPl/V V/C D$+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P N🅪Sg/V+ . C/P I/J/R/Dq+ NPl/V+ NPr/VX V NPr/I/V/J/Dq C/P NSg/I/V/J NSg/V/J P N🅪Sg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J+ NPl/V+ . V/C C/P I/J/R/Dq NPl/V P N🅪Sg/V+ V+ NSg/V/J+ . I/Ddem+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPr/J/P NSg/J NPl/V+ . NSg/R V/J P NSg/I/J/Dq J NPl/V+ . . D/P NSg/J
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J+ . P . N🅪Sg/V+ NSg/V VL JC C/P V/J V D/P NSg/V P NPl/V+ V/C D$+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P N🅪Sg/V+ . C/P I/J/R/Dq NPl/V+ NPr/VX V NPr/I/V/J/Dq C/P NSg/I/V/J NSg/V/J P N🅪Sg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J NPl/V+ . V/C C/P I/J/R/Dq NPl/V P N🅪Sg/V+ V NSg/V/J . I/Ddem+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPr/J/P NSg/J+ NPl/V+ . NSg/R V/J P NSg/I/J/Dq+ J+ NPl/V+ . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V+ . NPl/V+ V+ J+ . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPr/VX W? NSg/VX D/P NSg/V+ .
# NSg P NSg/V+ . NPl/V+ V J . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPr/VX W? NSg/VX D/P+ NSg/V+ .
>
#
> The sailor dogs the hatch .
# D+ NSg NPl/V D NSg/V+ .
> The sailor dogs the hatch .
# D+ NSg+ NPl/V+ D+ NSg/V+ .
>
#
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J+ J NSg/V NPr/VX V NSg/I/C/Ddem+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P+ NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPr/I/V/J/Dq NSg/V/J NSg/J NSg/V+ . J N🅪Sg/V+ VL NSg/I/V/J NSg/J+ P V
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J J NSg/V NPr/VX V NSg/I/C/Ddem+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPr/I/V/J/Dq NSg/V/J NSg/J NSg/V+ . J+ N🅪Sg/V+ VL NSg/I/V/J NSg/J P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/Ddem+ . NSg/J N🅪Sg+ NPr/VX W? NSg/VX V/J P V NSg/I/C/Ddem+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . NPr/J/P D+ J+ N🅪Sg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPr/J/P I/Ddem+ N🅪Sg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ N🅪Sg/V/J+ . V . D/P
# I/Ddem+ . NSg/J+ N🅪Sg+ NPr/VX W? NSg/VX V/J P V NSg/I/C/Ddem+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V+ . NSg/R # . NPr/J/P D J N🅪Sg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPr/J/P I/Ddem N🅪Sg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ N🅪Sg/V/J+ . V . D/P
> watertight door ) securely " ) .
# J NSg/V+ . R . . .
>
@@ -76,358 +76,358 @@
# NSg/V+ NPl/V
>
#
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V NSg/I/C/Ddem + V # NPl/V P N🅪Sg/V+ NPr/J/P NPr🅪/V/J . NSg/V+ . NSg/V+ .
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V NSg/I/C/Ddem + V # NPl/V P N🅪Sg/V NPr/J/P NPr🅪/V/J . NSg/V+ . NSg/V+ .
> article , adjective , preposition , pronoun , adverb , conjunction , and interjection .
# NSg/V+ . NSg/V/J+ . NSg/V . NSg/V+ . NSg/V+ . NSg/V+ . V/C NSg+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . + V R NSg/I/J/Dq NPr/I/V/J/Dq NPl+ V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPr/VX+ NSg/VX+ V/J+ . NPr/J/P NSg/I/J/Dq+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D$+ . NPr/V+ . . NSg NSg/R NSg/V/J . NSg/V+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . + V R NSg/I/J/Dq+ NPr/I/V/J/Dq+ NPl+ V/C NSg/V/P . NPl+ . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPr/VX NSg/VX V/J . NPr/J/P NSg/I/J/Dq+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D$+ . NPr/V+ . . NSg NSg/R NSg/V/J+ . NSg/V+ .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# + . . J+ NSg/V/J+ . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V+ V V/J C/P NSg/V/J . NSg/V+ .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPr/J/P I/J/R/Dq+ NSg/V NPl+ . NSg/J NPl P D+ I/J+
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPr/J/P I/J/R/Dq NSg/V NPl+ . NSg/J NPl P D I/J
> root word will get different parts of speech , resulting in a large number of
# NPr/V+ NSg/V+ NPr/VX NSg/V NSg/J NPl/V P N🅪Sg/V+ . V NPr/J/P D/P NSg/J NSg/V/JC P+
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J+ NPl/V . ? C/P NSg/J NSg/V/J+ NPl/V . NPr
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D+ NSg+ NPl/V+ V/J NPr/J/P D+ NPr/V/J+ NSg+ . . NSg/V/J
# NPr/V+ NSg/V+ NPr/VX NSg/V NSg/J NPl/V P N🅪Sg/V+ . V NPr/J/P D/P NSg/J NSg/V/JC P
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J NPl/V . ? C/P NSg/J NSg/V/J NPl/V . NPr
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D NSg+ NPl/V+ V/J NPr/J/P D NPr/V/J NSg+ . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl+ NSg/V D/P NSg/JC NSg/V/JC P NPl/V+ V/C V NSg/V/J NSg/V NPr/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ NSg/IPl+ NSg/R+ NPl/V+ NSg/I NSg/J P NSg/V/J . P . N🅪Sg/V+ .
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ NSg/IPl+ NSg/R NPl/V+ NSg/I NSg/J P NSg/V/J+ . P . N🅪Sg/V+ .
>
#
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPr/J/P NSg/V/J . P . N🅪Sg/V NSg/V NSg/J/P NSg/V+ . NPr/ISg+ VL NSg/J P V P # P
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPr/J/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/J/P NSg/V+ . NPr/ISg+ VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P N🅪Sg/V C/P NPr🅪/V/J+ . NSg/V J/P J NPl/V C/P NSg/V
# # NSg/V/J NPl/V P N🅪Sg/V C/P NPr🅪/V/J+ . NSg/V J/P J NPl/V+ C/P NSg/V
> Koine Greek ( DeRose 1990 ) has used over 1 , 000 parts of speech and found that
# ? NPr/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P N🅪Sg/V+ V/C NSg/V NSg/I/C/Ddem
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R NSg/I/J/Dq+ NPl/V+ NSg/V J NPr/J/P NSg/I/C/Ddem+ N🅪Sg/V+ NSg/R NPr/J/P NPr🅪/V/J+ . D/P
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R NSg/I/J/Dq NPl/V+ NSg/V J NPr/J/P NSg/I/C/Ddem N🅪Sg/V+ NSg/R NPr/J/P NPr🅪/V/J+ . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg NPr/J/P D NPr/V P ? NPr/V/J NPl/V+ VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J/R NPr/V/J/P+ NPl . NSg/I NSg/R ? C/P NSg . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V . NSg/V/J . NSg/V/J . NSg/J . NSg/V/JC . NSg/J . NPr/V . NSg/J . V/J
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J/R NPr/V/J/P NPl . NSg/I NSg/R ? C/P NSg+ . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V+ . NSg/V/J . NSg/V/J+ . NSg/J . NSg/V/JC+ . NSg/J . NPr/V+ . NSg/J . V/J
> = no .
# . NPr/P .
>
#
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J/Dq NSg/J . NSg/V+ NPr/V/J . C/P NSg+ NSg/V C/P NPr/J NPr🅪/V/J+ VL R D+
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J/Dq NSg/J . NSg/V NPr/V/J . C/P NSg+ NSg/V C/P NPr/J NPr🅪/V/J+ VL R D
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr+ NSg/V+ NPr/V/J . V/J NPr/J/P D+ NPr+ ? NSg/V+ . NPr/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D JC NPr/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C NSg/I/J/Dq+ NSg/JC+ . NPr/J/P
# NPr+ NSg/V+ NPr/V/J . V/J NPr/J/P D NPr+ ? NSg/V+ . NPr/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D JC NPr/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C NSg/I/J/Dq NSg/JC . NPr/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr+ . NSg/V+ NPl/V P D+ NPl/V+ NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V
> for multiple languages .
# C/P NSg/J/Dq+ NPl/V+ .
# NPr+ . NSg/V+ NPl/V P D NPl/V NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V+
> for multiple languages .
# C/P NSg/J/Dq NPl/V+ .
>
#
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg+ NSg/V NSg/V+ V NSg/V NSg/V/J NPr/J/P D/P NSg P NPl/V+ . V/C D NPr/V/J P NSg+
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V+ V/J NPl/V R P N🅪Sg/V+ . NPl/V+ R V V/J P NSg/V
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J J+ NPl+ . C I/Ddem+ NPl/V P NPl NSg/I NSg/R
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J+ J+ NPl+ . C I/Ddem NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPr/V+ . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPr/J/P NPr🅪/V/J+ . V/C NSg/I/J/Dq JC
> cross - language differences . The tag sets for heavily inflected languages such as
# NPr/V/J/P+ . N🅪Sg/V+ NSg/V . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
# NPr/V/J/P+ . N🅪Sg/V+ NSg/V+ . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPr/V/J V/C NPr/J NPr/VX NSg/VX J/R NSg/J . NSg/V NPl/V+ NPr/J/P ? NPl/V+ NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPr/J NPl/V+ NPr/VX NSg/VX R+ NSg/J+ . NSg/P D+ NSg/V/J+ NSg/J . ? ?
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPr/J NPl/V+ NPr/VX NSg/VX R NSg/J . NSg/P D NSg/V/J NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPr/V/J . P # NPl . C/P NSg/V+ . NPr/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NᴹSg+ . V/C NSg/I/J/C J/P+ . . I/C D/P J/R NPr/V/J NPr/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J/R NSg/J NPl/V NPr/C D/P NSg/I/J/Dq JC NPr/V/J P NPr/I/V/J/Dq V/J NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D+ NSg/V NSg/P NSg/V+ . NSg/J NSg/V VL NSg/JC J/P NSg/JC NSg/V+ . NPl/V+ .
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPr/V/J . P # NPl+ . C/P NSg/V+ . NPr/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NᴹSg+ . V/C NSg/I/J/C J/P . . I/C D/P J/R NPr/V/J NPr/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J/R NSg/J NPl/V+ NPr/C D/P NSg/I/J/Dq JC NPr/V/J P NPr/I/V/J/Dq V/J+ NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D NSg/V+ NSg/P NSg/V+ . NSg/J NSg/V VL NSg/JC J/P NSg/JC NSg/V+ . NPl/V .
>
#
> History
# N🅪Sg
# N🅪Sg+
>
#
> The Brown Corpus
# D NPr/V/J+ NSg
# D+ NPr/V/J+ NSg+
>
#
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NᴹSg/V J/P NSg/V/J . P . N🅪Sg/V NSg/V V NSg/V R V/J P NSg NᴹSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPr/V/J NSg P NPr🅪/V/J+ C/P NSg/V+ N🅪Sg+ V D NPr/V/J NSg
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NᴹSg/V J/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V V NSg/V R V/J P NSg NᴹSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPr/V/J NSg P NPr🅪/V/J C/P NSg/V+ N🅪Sg+ V D NPr/V/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPr/V/J NSg NSg/J/P NPr+ ? V/C ? NPr+ NPr+ . NPr/J/P D
# V/J NSg/P NPr/V/J NSg+ NSg/J/P NPr+ ? V/C ? NPr+ NPr+ . NPr/J/P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P+ . #d . NPr/ISg+ NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPr🅪/V/J+ NSg/V N🅪Sg/V+ .
> made up of 500 samples from randomly chosen publications . Each sample is 2 , 000
# V NSg/V/J/P P # NPl/V+ P R+ NᴹSg/V/J NPl+ . Dq+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPr/C NPr/I/V/J/Dq NPl/V+ . NSg/V NSg/P D NSg/V/J+ NSg/V+ . NSg/V P # . # NPl/V+ . NSg/I/J/C NSg/I/C/Ddem D+
# V NSg/V/J/P P # NPl/V+ P R NᴹSg/V/J NPl+ . Dq+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPr/C NPr/I/V/J/Dq NPl/V+ . NSg/V NSg/P D NSg/V/J NSg/V+ . NSg/V+ P # . # NPl/V+ . NSg/I/J/C NSg/I/C/Ddem D
> corpus contains only complete sentences ) .
# NSg+ V J/R/C NSg/V/J+ NPl/V+ . .
# NSg+ V J/R/C NSg/V/J NPl/V+ . .
>
#
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPr/V/J NSg V R . V/J . P NSg/V/J . P . N🅪Sg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# NSg/I/J/Dq+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P NPr/V NSg/J/P NPr V/C NPr .
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPr/V/J+ NSg+ V R . V/J . P NSg/V/J+ . P . N🅪Sg/V+ NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# NSg/I/J/Dq+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P+ NPr/V+ NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPr/I/V+ . V NSg/P+
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPr/I/V+ . V NSg/P
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C/Dq . C/P NSg/V+ . NSg/V+ NSg/J/C NSg/V+ NPr/VX V . NSg/C/P NSg/V+ NSg/J/C NSg/V+ . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPr/V+ V J/P # . NSg/V/J+ . ISg/D$+ NPl/V+ NSg/V R V/J
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPr/V+ V J/P # . NSg/V/J . ISg/D$+ NPl/V+ NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V+ . V/C JC NPl+ NSg/V NPr/J/P NSg NSg/I/J/C NSg/I/C/Ddem+ NSg/J/P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg/V V R NSg/V/J . V C/P I/J/R/Dq NPl/V+ J/P I/C+ NSg/V/J NSg/V/J
# D NSg/V V R NSg/V/J . V C/P I/J/R/Dq NPl/V+ J/P I/C+ NSg/V/J NSg/V/J+
> speakers might not agree ) .
# + NᴹSg/VX/J NSg/C V . .
>
#
> This corpus has been used for innumerable studies of word - frequency and of
# I/Ddem+ NSg V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . N🅪Sg/V V/C V/J D N🅪Sg P NSg/J . V/J . NPl NPr/J/P NSg/I/J/Dq+
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J+ NPl/V+ . NPl/V+ V/J NSg/J/P V NPr/ISg+ V/J D NSg C/P NSg/I/J/Dq
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# JC NSg/V/J . P . N🅪Sg/V NSg/V NPl . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
# I/Ddem+ NSg+ V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J+ . P . N🅪Sg/V+ V/C V/J D N🅪Sg P NSg/J . V/J . NPl+ NPr/J/P NSg/I/J/Dq
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J NPl/V+ . NPl/V+ V/J NSg/J/P V NPr/ISg+ V/J D+ NSg+ C/P NSg/I/J/Dq
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# JC NSg/V/J+ . P . N🅪Sg/V+ NSg/V NPl+ . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/Ddem+ N🅪Sg/V/J+ . # . NPr/ISg+ V NSg/V V/J NSg/J/P JC NPl+ NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# NSg NSg/V+ NPr/J NSg/J+ NSg+ . NSg/V/J V/C JC+ NPl+ V R NSg/I/J/C
# NSg NSg/V+ NPr/J NSg/J NSg+ . NSg/V/J V/C JC NPl+ V R NSg/I/J/C
> thoroughly curated .
# R+ V/J+ .
# R V/J .
>
#
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R/Dq N🅪Sg/V/J . NSg/V/J . P . N🅪Sg/V NSg/V V V/J D/P NSg/J NSg/V/J P
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R/Dq N🅪Sg/V/J+ . NSg/V/J+ . P . N🅪Sg/V+ NSg/V V V/J D/P NSg/J NSg/V/J P
> natural language processing , because there are certain cases where the correct
# NSg/J+ N🅪Sg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
# NSg/J N🅪Sg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
> part of speech cannot be decided without understanding the semantics or even the
# NSg/V/J P N🅪Sg/V+ NSg/V NSg/VX NSg/V/J C/P NᴹSg/V/J+ D+ NPl NPr/C NSg/V/J D
# NSg/V/J P N🅪Sg/V+ NSg/V NSg/VX NSg/V/J C/P NᴹSg/V/J+ D NPl+ NPr/C NSg/V/J D
> pragmatics of the context . This is extremely expensive , especially because
# NPl P D+ N🅪Sg/V+ . I/Ddem+ VL R J . R C/P
# NPl P D N🅪Sg/V+ . I/Ddem+ VL R J . R C/P
> analyzing the higher levels is much harder when multiple part - of - speech
# V D+ NSg/JC+ NPl/V+ VL NSg/I/J/Dq JC NSg/I/C NSg/J/Dq NSg/V/J . P . N🅪Sg/V
# V D+ NSg/JC+ NPl/V+ VL NSg/I/J/Dq JC NSg/I/C NSg/J/Dq NSg/V/J . P . N🅪Sg/V+
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P Dq+ NSg/V+ .
# NPl+ NSg/V NSg/VX V/J C/P Dq+ NSg/V+ .
>
#
> Use of hidden Markov models
# NSg/V P V/J NPr+ NPl/V
# NSg/V P V/J NPr NPl/V+
>
#
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPr/J/P D NSg/J/P . #d . NPl NPr/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPr/J/P D NSg/J/P+ . #d . NPl NPr/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> to disambiguate parts of speech , when working to tag the Lancaster - Oslo - Bergen
# P V NPl/V P N🅪Sg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPr/J+ NPr🅪/V/J+ . ? V V NPl/V . NSg/I NSg/R P D+ NPr/V/J+
# P V NPl/V P N🅪Sg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr+
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPr/J NPr🅪/V/J+ . ? V V NPl/V+ . NSg/I NSg/R P D NPr/V/J
> Corpus ) and making a table of the probabilities of certain sequences . For
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J+ NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D+ NSg/J/P+ NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V # . P D+ N🅪Sg/V/J+ . D/P+ NSg/V/J+ # . . V/C D/P+ NSg/V/JC+ # . . NSg/V/J/P I/Ddem+ . D/P+
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D NSg/J/P NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V+ # . P D N🅪Sg/V/J+ . D/P NSg/V/J+ # . . V/C D/P NSg/V/JC+ # . . NSg/V/J/P I/Ddem+ . D/P+
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPr/V+ NPr/VX V NSg/I/C/Ddem+ . NPr/VX . NPr/J/P . D+ NPr/VX . VL NSg/V/J NPr/I/V/J/Dq NSg/J P NSg/VX D/P NSg/V C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg/V NPr/C D/P+ NSg/J+ . D+ I/J+ NSg/V+ NPr/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NᴹSg+ J/P D+ NSg/V/J/P+ NPl/V .
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P+ NSg/V+ NPr/C D/P NSg/J . D+ I/J+ NSg/V+ NPr/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NᴹSg+ J/P D+ NSg/V/J/P NPl/V+ .
>
#
> More advanced ( " higher - order " ) HMMs learn the probabilities not only of pairs
# NPr/I/V/J/Dq V/J . . NSg/JC . NSg/V . . ? NSg/V D+ NPl+ NSg/C J/R/C P NPl/V+
# NPr/I/V/J/Dq V/J . . NSg/JC . NSg/V . . ? NSg/V D NPl+ NSg/C J/R/C P NPl/V+
> but triples or even larger sequences . So , for example , if you've just seen a
# NSg/C/P NPl/V NPr/C NSg/V/J JC NPl/V+ . NSg/I/J/C . C/P NSg/V+ . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V V/J NSg/J/P D/P+ NSg/V+ . D+ NSg/J/P+ NSg/V+ NPr/VX NSg/VX J/R NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPr/C NSg/V+ . NSg/C/P NSg/I/J/Dq V/J/C/P NSg/J+ I/D NSg/V .
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V+ V/J NSg/J/P D/P NSg/V+ . D NSg/J/P NSg/V+ NPr/VX NSg/VX J/R NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPr/C NSg/V+ . NSg/C/P NSg/I/J/Dq V/J/C/P NSg/J I/D NSg/V+ .
>
#
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/Dq J NPl/V+ V J . D+ NPl NSg/V+ .
# NSg/I/C J/Dq+ J+ NPl/V+ V J . D+ NPl+ NSg/V .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPr/ISg+ VL NSg/V/J P V Dq+ N🅪Sg+ V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P Dq+ NSg/I/V/J+ . NSg/J/P V J D NPl P Dq+
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPr/J/P NSg/V . D N🅪Sg P D+ JS+ NSg+ VL NSg/J/C+ NᴹSg/V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V+ NPr/V+ NSg/I/C/Ddem+ V R I/Ddem+ V/C
# NSg+ P Dq NSg/I/V/J+ . NSg/J/P V J D NPl P Dq
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPr/J/P NSg/V . D N🅪Sg P D+ JS+ NSg+ VL NSg/J/C NᴹSg/V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V NPr/V+ NSg/I/C/Ddem+ V R I/Ddem V/C
> achieved accuracy in the 93 95 % range .
# V/J N🅪Sg+ NPr/J/P D # . # . NSg/V+ .
>
#
> Eugene Charniak points out in Statistical techniques for natural language
# NPr+ ? NPl/V+ NSg/V/J/R/P NPr/J/P J NPl C/P NSg/J N🅪Sg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . NSg/I/C/Ddem+ R V D NSg/I/J/Dq NSg/V/J NSg/V P Dq+ V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V . P NSg/I/J/C/Dq+ NPl/V+ NPr/VX NSg/V # . N🅪Sg+ C/P NSg/I/J/Dq+
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C NSg/I/J/Dq+ NPl/V+ J/R/C R V D$+ V/J/C/P . NSg/V/J
# NPr+ ? NPl/V+ NSg/V/J/R/P NPr/J/P J NPl C/P NSg/J+ N🅪Sg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . NSg/I/C/Ddem+ R V D NSg/I/J/Dq NSg/V/J NSg/V+ P Dq V/J NSg/V+ V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V+ . P NSg/I/J/C/Dq NPl/V+ NPr/VX NSg/V+ # . N🅪Sg+ C/P NSg/I/J/Dq
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C NSg/I/J/Dq NPl/V+ J/R/C R V D$+ V/J/C/P . NSg/V/J
> parts of speech .
# NPl/V P N🅪Sg/V+ .
>
#
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V+ V/J D NSg/V P V . V/J NSg/V/J P N🅪Sg/V+ NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPr/ISg+ V/J NSg/I/J/C/Dq+ NPl+ . NPr/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? NSg/I/J/Dq+ NPl/V . D+ NPr/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPr/V P # J NPl/V NPr/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPr/ISg+ V/J NSg/I/J/C/Dq NPl+ . NPr/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? NSg/I/J/Dq NPl/V . D+ NPr/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPr/V+ P # J NPl/V NPr/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . NSg/I/C/Ddem+ NPr/VX V NSg/R NSg/I/J/Dq NSg/R # V/J NPl/V P N🅪Sg/V+ .
>
#
> HMMs underlie the functioning of stochastic taggers and are used in various
# ? V D V P J NPl V/C V V/J NPr/J/P J
# ? V D V+ P J NPl V/C V V/J NPr/J/P J
> algorithms one of the most widely used being the bi - directional inference
# NPl+ NSg/I/V/J P D NSg/I/J/Dq R V/J NSg/V/C D NSg/J . NSg/J NSg+
> algorithm .
# NSg+ .
# NSg .
>
#
> Dynamic programming methods
# NSg/J+ NᴹSg/V+ NPl/V
# NSg/J+ NᴹSg/V+ NPl/V+
>
#
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# NPr/J/P # . NPr+ ? V/C NPr+ ? NPr/V+ R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NᴹSg/V+ NPl+ P NSg/V D I/J NSg/J NPr/J/P R V/J/C/P N🅪Sg/V/J+ . D$+
# NᴹSg/V+ NPl+ P NSg/V D I/J NSg/J+ NPr/J/P R V/J/C/P N🅪Sg/V/J+ . D$+
> methods were similar to the Viterbi algorithm known for some time in other
# NPl/V+ NSg/V NSg/J P D ? NSg V/J C/P I/J/R/Dq N🅪Sg/V/J+ NPr/J/P NSg/V/J+
# NPl/V+ NSg/V NSg/J P D ? NSg V/J C/P I/J/R/Dq N🅪Sg/V/J+ NPr/J/P NSg/V/J
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V+ . ? V/J D/P NSg/V P NPl/V+ . NSg/V/C/P NPr/V+ V/J D/P NSg/V P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg/V P V D NPl/V C/P NPl/V NSg/I/C/Ddem+ NSg/V NSg/V/J NPr/C NSg/J NPr/J/P D+
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPr/V/J+ NSg . D/P NSg/J NSg P NSg/V/J NPl+ VX NSg/V D/P NSg/I/J/Dq
# NSg/V P V D NPl/V+ C/P NPl/V NSg/I/C/Ddem+ NSg/V NSg/V/J NPr/C NSg/J NPr/J/P D
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPr/V/J NSg+ . D/P NSg/J NSg P NSg/V/J NPl+ VX NSg/V D/P NSg/I/J/Dq
> larger corpus ) . Both methods achieved an accuracy of over 95 % . DeRose's 1990
# JC NSg+ . . I/C/Dq NPl/V+ V/J D/P N🅪Sg P NSg/V/J/P # . . ? #
# JC NSg+ . . I/C/Dq NPl/V+ V/J D/P N🅪Sg+ P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg+ NSg/P NPr/V/J NSg+ V/J NPl/V/Au/Br P D+ NSg/J+ NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J+ J+ N🅪Pl+ . V/C V/J ISg/D$+ NSg/V C/P NPr/V/J . NSg/C
# NSg+ NSg/P NPr/V/J NSg+ V/J NPl/V/Au/Br P D NSg/J NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J J N🅪Pl+ . V/C V/J ISg/D$+ NSg/V+ C/P NPr/V/J . NSg/C
> it proved similarly effective .
# NPr/ISg+ V/J R+ NSg/J .
# NPr/ISg+ V/J R NSg/J .
>
#
> These findings were surprisingly disruptive to the field of natural language
# I/Ddem+ NSg NSg/V R J P D NSg/V P NSg/J+ N🅪Sg/V+
# I/Ddem+ NSg+ NSg/V R J P D NSg/V P NSg/J+ N🅪Sg/V+
> processing . The accuracy reported was higher than the typical accuracy of very
# V+ . D+ N🅪Sg+ V/J V NSg/JC C/P D NSg/J N🅪Sg P J/R
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl+ NSg/I/C/Ddem+ V/J NSg/V/J P N🅪Sg/V+ NSg/J P NSg/I/J/Dq NSg/JC
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J N🅪Sg+ . NSg+ . NSg+ . NPl+ . V/C NSg/I/J/C+ J/P . NPl/V .
# V/J+ NPl+ NSg/I/C/Ddem+ V/J NSg/V/J P N🅪Sg/V+ NSg/J+ P NSg/I/J/Dq NSg/JC
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J N🅪Sg . NSg+ . NSg+ . NPl+ . V/C NSg/I/J/C J/P . NPl/V+ .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C NSg$ NPl/V+ V NSg/V/J C/P I/J/R/Dq P D+ V/J+ NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NPl+ VL V/J . NSg/C/P I/Ddem+ V/J R+ NSg/V/J+ . I/Ddem+ V/J NSg/I/J/Dq NPr/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ NSg/I/C/Ddem+ NSg/V/J . P . N🅪Sg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
# ? V/C NSg$ NPl/V+ V NSg/V/J C/P I/J/R/Dq P D V/J NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NPl+ VL V/J . NSg/C/P I/Ddem V/J R NSg/V/J . I/Ddem V/J NSg/I/J/Dq NPr/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ NSg/I/C/Ddem+ NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
> levels of processing ; this , in turn , simplified the theory and practice of
# NPl/V P V . I/Ddem+ . NPr/J/P NSg/V . V/J D+ NSg V/C NSg/V P
# NPl/V P V+ . I/Ddem+ . NPr/J/P NSg/V . V/J D NSg V/C NSg/V P
> computerized language analysis and encouraged researchers to find ways to
# V/J N🅪Sg/V+ N🅪Sg+ V/C V/J NPl+ P NSg/V NPl+ P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J+ NPl/V+ NSg/R+ NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/V/J . P . N🅪Sg/V+ NSg+ .
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J NPl/V+ NSg/R NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V+ C/P D
> part - of - speech assignment .
# NSg/V/J+ . P . N🅪Sg/V+ NSg+ .
>
#
> Unsupervised taggers
# V/J+ NPl
# V/J NPl
>
#
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V W? V/J V V P D/P NSg/V/P+ . V NSg P
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V+ W? V/J V V P D/P+ NSg/V/P+ . V NSg+ P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V+ NPl+ . NPr/ISg+ VL . C . W? NSg/J P NSg/V V
> " unsupervised " tagging . Unsupervised tagging techniques use an untagged corpus
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P J NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D$+ NSg/V+ N🅪Pl+ V/C NSg/V D NSg NSg/J/P+ NSg . NSg/I/C/Ddem+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPr/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J . P . N🅪Sg/V NPl+ IPl+ .
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P J NSg+
> for their training data and produce the tagset by induction . That is , they
# C/P D$+ NSg/V+ N🅪Pl+ V/C NSg/V D NSg NSg/J/P NSg . NSg/I/C/Ddem+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPr/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J+ . P . N🅪Sg/V+ NPl+ IPl+ .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V+ . NPl/V+ R NSg/V NSg/I/C/Ddem+ . D . . . D/P . . V/C . D/P . V NPr/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J+ NPl/V+ . NSg/V/C/P . V . V NPr/J/P J/R NSg/J+ NPl/V+ . P J+
# NSg/J+ NPl/V+ . NSg/V/C/P . V . V NPr/J/P J/R NSg/J+ NPl/V+ . P J
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V+ NSg/V NSg/I/C/Ddem+ V R NSg/J P
> those human linguists would expect ; and the differences themselves sometimes
# I/Ddem+ NSg/V/J NPl+ VX V . V/C D+ NSg/V+ IPl+ R
> suggest valuable new insights .
# V NSg/J+ NSg/V/J+ NPl+ .
> those human linguists would expect ; and the differences themselves sometimes
# I/Ddem NSg/V/J NPl+ VX V . V/C D NSg/V+ IPl+ R
> suggest valuable new insights .
# V NSg/J NSg/V/J NPl+ .
>
#
> These two categories can be further subdivided into rule - based , stochastic , and
# I/Ddem NSg+ NPl NPr/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> These two categories can be further subdivided into rule - based , stochastic , and
# I/Ddem+ NSg+ NPl+ NPr/VX NSg/VX V/J V/J P NSg/V+ . V/J . J . V/C
> neural approaches .
# J+ NPl/V+ .
# J NPl/V+ .
>
#
> Other taggers and methods
# NSg/V/J+ NPl V/C NPl/V
> Other taggers and methods
# NSg/V/J NPl V/C NPl/V+
>
#
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R/Dq+ NSg/J NPr/V/J NPl C/P NSg/V/J . P . N🅪Sg/V NSg/V NSg/V D ?
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R/Dq NSg/J NPr/V/J NPl C/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/V D ?
> algorithm , Brill tagger , Constraint Grammar , and the Baum - Welch algorithm ( also
# NSg . NSg/J NSg . NSg+ NSg/V+ . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# V/J NSg/R D NSg/V/J . NSg/J NSg+ . . V/J NPr NSg/V/J+ V/C J NPr
# V/J NSg/R D NSg/V/J . NSg/J NSg . . V/J NPr NSg/V/J+ V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J+ NPl NPr/VX I/C/Dq NSg/VX V/J V D+ ? NSg . D
# NSg/V/J+ NPl NPr/VX I/C/Dq NSg/VX V/J V D ? NSg . D+
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg/V+ . V/J NSg/J NSg VL NSg/J NPr/J/P NSg/I/C/Ddem NPr/ISg+ NPl/V D/P NPr/V/J P NSg/V+ NPl/V+ . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/Ddem+ NPl/V+ NPr/V/J C/P V D/P+ J+ NSg+ .
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/Ddem NPl/V+ NPr/V/J C/P V D/P J NSg+ .
>
#
> Many machine learning methods have also been applied to the problem of POS
# NSg/I/J/Dq+ NSg/V V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
# NSg/I/J/Dq+ NSg/V+ V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
> tagging . Methods such as SVM , maximum entropy classifier , perceptron , and
# NSg/V+ . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . NSg . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# JS . NSg/V/J/Am NSg/VX NSg/I/J/C/Dq NSg/V V/J . V/C NSg/I/J/Dq NPr/VX V N🅪Sg+ NSg/J/P
# NSg/V . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . NSg . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# JS . NSg/V/J/Am+ NSg/VX NSg/I/J/C/Dq NSg/V V/J . V/C NSg/I/J/Dq NPr/VX V N🅪Sg+ NSg/J/P
> 95 % . [ citation needed ]
# # . . . NSg+ V/J+ .
# # . . . NSg+ V/J .
>
#
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P V/J NSg P J/Dq+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D+ NSg+
# D/P V/J NSg P J/Dq+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D NSg
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V+ . I/Ddem+ NSg+ NPl/V D+ NPr+ NSg/V+ NPr/V/J J/P I/J/R/Dq P D+ NPr+ ? N🅪Pl+ .
# NSg/V+ . I/Ddem+ NSg+ NPl/V D+ NPr+ NSg/V+ NPr/V/J J/P I/J/R/Dq P D NPr+ ? N🅪Pl+ .
> so the results are directly comparable . However , many significant taggers are
# NSg/I/J/C D+ NPl/V+ V R/C NSg/J+ . C . NSg/I/J/Dq NSg/J NPl V
# NSg/I/J/C D NPl/V+ V R/C NSg/J . C . NSg/I/J/Dq NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D+ NPr/V/Am/Au+ V/J NPr/J/P V NSg/IPl+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/Ddem+ NSg/J+ NSg . . NSg . NPr/ISg+ VX NSg/C NSg/VX V/J NSg/I/C/Ddem D+ NPl/V+
# NSg/C V/J . NSg C/P P D NPr/V/Am/Au+ V/J NPr/J/P V NSg/IPl+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/Ddem NSg/J NSg . . NSg . NPr/ISg+ VX NSg/C NSg/VX V/J NSg/I/C/Ddem D+ NPl/V+
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPr/VX/JS NSg/I/C/Ddem+ NPr/VX NSg/VX V/J P D/P+ NSg/V/J/P+ NSg/V+ . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
@@ -435,6 +435,6 @@
>
#
> In 2014 , a paper reporting using the structure regularization method for
# NPr/J/P # . D/P+ N🅪Sg/V/J+ V V D+ NSg/V+ N🅪Sg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . N🅪Sg/V NSg/V . V # . J/P D/P NSg/J+ NSg/V+ NSg .
# NPr/J/P # . D/P+ N🅪Sg/V/J+ V V D NSg/V+ N🅪Sg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J+ . P . N🅪Sg/V+ NSg/V . V # . J/P D/P NSg/J NSg/V NSg .

View File

@@ -3,15 +3,15 @@
>
#
> This document contains a list of words spelled correctly in some dialects of English , but not American English . This is designed to test the spelling suggestions we give for such mistakes .
# I/Ddem+ NSg/V V D/P NSg/V P NPl/V+ V/J R NPr/J/P I/J/R/Dq NPl P NPr🅪/V/J+ . NSg/C/P NSg/C NPr/J+ NPr🅪/V/J+ . I/Ddem+ VL V/J P NSg/V D+ NSg/V+ NPl+ IPl+ NSg/V C/P NSg/I+ NPl/V+ .
# I/Ddem+ NSg/V+ V D/P NSg/V P NPl/V+ V/J R NPr/J/P I/J/R/Dq NPl P NPr🅪/V/J+ . NSg/C/P NSg/C NPr/J NPr🅪/V/J+ . I/Ddem+ VL V/J P NSg/V D+ NSg/V+ NPl+ IPl+ NSg/V C/P NSg/I+ NPl/V+ .
>
#
> To achieve this , the filename of this file contains `.US , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
# P V I/Ddem . D NSg P I/Ddem+ NSg/V+ V Unlintable . I/C+ NPr/VX NPr/V D+ NSg/V+ NSg P NSg/V D+ NPr/J+ NSg+ . NPr/V/J C/P NSg/V/J P NSg/V D/P W? V/J NSg+ .
> To achieve this , the filename of this file contains `.US , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
# P V I/Ddem+ . D NSg P I/Ddem NSg/V+ V Unlintable . I/C+ NPr/VX NPr/V D NSg/V+ NSg P NSg/V D NPr/J NSg+ . NPr/V/J C/P NSg/V/J P NSg/V D/P W? V/J NSg+ .
>
#
> Words
# NPl/V
# NPl/V+
>
#
>
@@ -26,36 +26,36 @@
# NSg/V/Comm+ .
>
#
> Labelled .
# V/J/Comm+ .
> Labelled .
# V/J/Comm .
>
#
> Flavour .
# N🅪Sg/V/Comm+ .
>
#
> Favoured .
# V/J/Comm+ .
> Favoured .
# V/J/Comm .
>
#
> Honour .
# N🅪Sg/V/Comm+ .
>
#
> Grey .
# NPr/V/J/Comm+ .
> Grey .
# NPr/V/J/Comm .
>
#
> Quarrelled .
# V/Comm+ .
# V/Comm .
>
#
> Quarrelling .
# NᴹSg/V/Comm+ .
> Quarrelling .
# NᴹSg/V/Comm .
>
#
> Recognised .
# V/J/Au/Br+ .
# V/J/Au/Br .
>
#
> Neighbour .
@@ -63,11 +63,11 @@
>
#
> Neighbouring .
# V/Comm+ .
# V/Comm .
>
#
> Clamour .
# NSg/V/Comm+ .
> Clamour .
# NSg/V/Comm .
>
#
> Theatre .

View File

@@ -3,15 +3,15 @@
>
#
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/Ddem+ NSg/V V NSg/V+ NPl/V P V/J+ NPl/V+ NSg/I/C/Ddem+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
# I/Ddem+ NSg/V+ V NSg/V+ NPl/V+ P V/J NPl/V+ NSg/I/C/Ddem+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
>
#
> Example Sentences
# NSg/V+ NPl/V
# NSg/V+ NPl/V+
>
#
> My favourite color is blu .
# D$+ NSg/V/J/Comm N🅪Sg/V/J/Am VL+ W? .
> My favourite color is blu .
# D$+ NSg/V/J/Comm+ N🅪Sg/V/J/Am+ VL W? .
> I must defend my honour !
# ISg+ NSg/V NSg/V D$+ N🅪Sg/V/Comm+ .
> I recognize that you recognise me .

View File

@@ -2,17 +2,17 @@
# NPl/V
>
#
> This documents tests that different forms / variations of swears are tagged as such .
# I/Ddem+ NPl/V+ NPl/V+ NSg/I/C/Ddem NSg/J NPl/V . W? P NPl/V V V/J NSg/R NSg/I .
> This documents tests that different forms / variations of swears are tagged as such .
# I/Ddem+ NPl/V+ NPl/V+ NSg/I/C/Ddem NSg/J+ NPl/V+ . W? P NPl/V V V/J NSg/R NSg/I .
>
#
> Examples
# NPl/V
# NPl/V+
>
#
> One turd , two turds .
# NSg/I/V/J+ NSg/V/B . NSg+ NPl/V/B .
> One turd , two turds .
# NSg/I/V/J+ NSg/V+/B . NSg NPl/V/B .
>
#
> I fart , you're farting , he farts , she farted .
# ISg+ NSg/V/B . + V/B . NPr/ISg+ NPl/V/B . ISg+ V/J+/B .
# ISg+ NSg/V/B . + V/B . NPr/ISg+ NPl/V/B . ISg+ V/J/B .

File diff suppressed because it is too large

View File

@@ -1,11 +1,11 @@
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/Ddem+ . V/C . NSg/I/C/Ddem+ . V NSg/V/J V/C V/NoAm NSg/J/Dq NPl/V NPr/J/P NSg/J+ NPr🅪/V/J+ .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V NSg/IPl+ VL+ NSg/J .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V NSg/IPl+ VL NSg/J .
>
#
> This document contains various sentences that use " this " , " that " , " these " , and
# I/Ddem+ NSg/V V J NPl/V+ NSg/I/C/Ddem+ NSg/V . I/Ddem+ . . . NSg/I/C/Ddem+ . . . I/Ddem+ . . V/C
> This document contains various sentences that use " this " , " that " , " these " , and
# I/Ddem+ NSg/V+ V J+ NPl/V+ NSg/I/C/Ddem+ NSg/V . I/Ddem+ . . . NSg/I/C/Ddem+ . . . I/Ddem . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/Ddem . NPr/J/P NSg/J NPl/V P D/P NPr/V P NSg/V+ NPl/V+ .
>
@@ -14,54 +14,54 @@
# NPl/V+
>
#
> This triangle is nice .
# I/Ddem+ NSg+ VL+ NPr/V/J+ .
> This is nice .
# I/Ddem+ VL+ NPr/V/J+ .
> That triangle is nice .
# NSg/I/C/Ddem+ NSg+ VL+ NPr/V/J+ .
> That is nice .
# NSg/I/C/Ddem+ VL+ NPr/V/J+ .
> These triangles are nice .
# I/Ddem+ NPl+ V+ NPr/V/J+ .
> These are nice .
# I/Ddem+ V+ NPr/V/J+ .
> Those triangles are nice .
# I/Ddem+ NPl+ V+ NPr/V/J+ .
> This triangle is nice .
# I/Ddem NSg VL NPr/V/J .
> This is nice .
# I/Ddem+ VL NPr/V/J .
> That triangle is nice .
# NSg/I/C/Ddem+ NSg VL NPr/V/J .
> That is nice .
# NSg/I/C/Ddem+ VL NPr/V/J .
> These triangles are nice .
# I/Ddem NPl V NPr/V/J .
> These are nice .
# I/Ddem+ V NPr/V/J .
> Those triangles are nice .
# I/Ddem NPl V NPr/V/J .
> Those are nice .
# I/Ddem+ V+ NPr/V/J .
# I/Ddem+ V NPr/V/J .
>
#
> This massage is nice .
# I/Ddem+ NSg/V+ VL+ NPr/V/J+ .
> That massage is nice .
# NSg/I/C/Ddem NSg/V+ VL+ NPr/V/J+ .
> These massages are nice .
# I/Ddem+ NPl/V+ V+ NPr/V/J+ .
> Those massages are nice .
# I/Ddem+ NPl/V+ V+ NPr/V/J+ .
> This massages well .
# I/Ddem+ NPl/V+ NSg/V/J+ .
> That massages well .
# NSg/I/C/Ddem+ NPl/V+ NSg/V/J+ .
> These massage well .
# I/Ddem+ NSg/V+ NSg/V/J+ .
> Those massage well .
# I/Ddem+ NSg/V+ NSg/V/J+ .
> This massage is nice .
# I/Ddem+ NSg/V+ VL NPr/V/J .
> That massage is nice .
# NSg/I/C/Ddem NSg/V+ VL NPr/V/J .
> These massages are nice .
# I/Ddem+ NPl/V+ V NPr/V/J .
> Those massages are nice .
# I/Ddem+ NPl/V+ V NPr/V/J .
> This massages well .
# I/Ddem+ NPl/V+ NSg/V/J .
> That massages well .
# NSg/I/C/Ddem+ NPl/V+ NSg/V/J .
> These massage well .
# I/Ddem+ NSg/V+ NSg/V/J .
> Those massage well .
# I/Ddem+ NSg/V+ NSg/V/J .
>
#
> That could be a solution .
# NSg/I/C/Ddem+ NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C/Dq+ NPl/V+ NSg/I/C/Ddem+ NSg/VX NSg/VX D/P NSg+ .
> That could be a solution .
# NSg/I/C/Ddem+ NSg/VX NSg/VX D/P+ NSg+ .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C/Dq+ NPl/V+ NSg/I/C/Ddem+ NSg/VX NSg/VX D/P+ NSg+ .
>
#
> This is all that I have .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem ISg+ NSg/VX+ .
> This is all that solutions can do .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem NPl+ NPr/VX+ NSg/VX .
> That solution can do .
# NSg/I/C/Ddem NSg+ NPr/VX+ NSg/VX .
> This is all that I have .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem ISg+ NSg/VX .
> This is all that solutions can do .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem NPl+ NPr/VX NSg/VX .
> That solution can do .
# NSg/I/C/Ddem NSg+ NPr/VX NSg/VX .
>
#
> We can do this !

View File

@@ -39,7 +39,7 @@ struct Args {
// Setting worker threads to four means the process will use about five threads total
// This is because worker threads do not include blocking threads
#[tokio::main(worker_threads = 4)]
#[tokio::main(worker_threads = 1)]
async fn main() -> anyhow::Result<()> {
let subscriber = FmtSubscriber::builder()
.map_writer(move |_| stderr)

View File

@@ -15,8 +15,13 @@ serde = { version = "1.0.219", features = ["derive"] }
is-macro = "0.3.7"
rayon = { version = "1.10.0", optional = true }
rand = { version = "0.9.1", optional = true }
burn = { version = "0.18.0", default-features = false, features = ["std"] }
burn-ndarray = { version = "0.18.0", default-features = false }
serde_json = "1.0.140"
itertools = "0.14.0"
lru = "0.16.0"
[features]
default = []
threaded = ["dep:rayon"]
training = ["dep:rand"]
training = ["dep:rand", "burn/train", "burn/autodiff"]

View File

@@ -0,0 +1,399 @@
use crate::{UPOS, chunker::Chunker};
#[cfg(feature = "training")]
use burn::backend::Autodiff;
#[cfg(feature = "training")]
use burn::nn::loss::{MseLoss, Reduction};
use burn::nn::{Dropout, DropoutConfig};
#[cfg(feature = "training")]
use burn::optim::{GradientsParams, Optimizer};
use burn::record::{FullPrecisionSettings, NamedMpkBytesRecorder, NamedMpkFileRecorder, Recorder};
use burn::tensor::TensorData;
#[cfg(feature = "training")]
use burn::tensor::backend::AutodiffBackend;
use burn::{
module::Module,
nn::{BiLstmConfig, EmbeddingConfig, LinearConfig},
tensor::{Int, Tensor, backend::Backend},
};
use burn_ndarray::{NdArray, NdArrayDevice};
use hashbrown::HashMap;
use std::path::Path;
const UNK_IDX: usize = 1;
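/// A small per-token noun-phrase scorer: word and UPOS embeddings are concatenated,
/// fed through a bidirectional LSTM, and projected down to a single logit per token.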
#[derive(Module, Debug)]
struct NpModel<B: Backend> {
embedding_words: burn::nn::Embedding<B>,
embedding_upos: burn::nn::Embedding<B>,
lstm: burn::nn::BiLstm<B>,
linear_out: burn::nn::Linear<B>,
dropout: Dropout,
}
impl<B: Backend> NpModel<B> {
fn new(vocab: usize, word_embed_dim: usize, dropout: f32, device: &B::Device) -> Self {
let upos_embed = 8;
let total_embed = word_embed_dim + upos_embed;
Self {
embedding_words: EmbeddingConfig::new(vocab, word_embed_dim).init(device),
embedding_upos: EmbeddingConfig::new(20, upos_embed).init(device),
lstm: BiLstmConfig::new(total_embed, total_embed, false).init(device),
// Multiply by two because the BiLSTM concatenates its forward and backward passes, doubling the hidden output size
linear_out: LinearConfig::new(total_embed * 2, 1).init(device),
dropout: DropoutConfig::new(dropout as f64).init(),
}
}
fn forward(
&self,
word_tens: Tensor<B, 2, Int>,
tag_tens: Tensor<B, 2, Int>,
use_dropout: bool,
) -> Tensor<B, 2> {
let word_embed = self.embedding_words.forward(word_tens);
let tag_embed = self.embedding_upos.forward(tag_tens);
let mut x = Tensor::cat(vec![word_embed, tag_embed], 2);
if use_dropout {
x = self.dropout.forward(x);
}
let (mut x, _) = self.lstm.forward(x, None);
if use_dropout {
x = self.dropout.forward(x);
}
let x = self.linear_out.forward(x);
x.squeeze::<2>(2)
}
}
pub struct BurnChunker<B: Backend> {
vocab: HashMap<String, usize>,
model: NpModel<B>,
device: B::Device,
}
impl<B: Backend> BurnChunker<B> {
fn idx(&self, tok: &str) -> usize {
*self.vocab.get(tok).unwrap_or(&UNK_IDX)
}
fn to_tensors(
&self,
sent: &[String],
tags: &[Option<UPOS>],
) -> (Tensor<B, 2, Int>, Tensor<B, 2, Int>) {
// Build parallel index tensors for the word tokens and their UPOS tags
let idxs: Vec<_> = sent.iter().map(|t| self.idx(t) as i32).collect();
let upos: Vec<_> = tags
.iter()
.map(|t| t.map(|o| o as i32 + 2).unwrap_or(1))
.collect();
let word_tensor =
Tensor::<B, 1, Int>::from_data(TensorData::from(idxs.as_slice()), &self.device)
.reshape([1, sent.len()]);
let tag_tensor =
Tensor::<B, 1, Int>::from_data(TensorData::from(upos.as_slice()), &self.device)
.reshape([1, sent.len()]);
(word_tensor, tag_tensor)
}
pub fn save_to(&self, dir: impl AsRef<Path>) {
let dir = dir.as_ref();
std::fs::create_dir_all(dir).unwrap();
let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::new();
self.model
.clone()
.save_file(dir.join("model.mpk"), &recorder)
.unwrap();
let vocab_bytes = serde_json::to_vec(&self.vocab).unwrap();
std::fs::write(dir.join("vocab.json"), vocab_bytes).unwrap();
}
pub fn load_from_bytes(
model_bytes: impl AsRef<[u8]>,
vocab_bytes: impl AsRef<[u8]>,
embed_dim: usize,
dropout: f32,
device: B::Device,
) -> Self {
let vocab: HashMap<String, usize> = serde_json::from_slice(vocab_bytes.as_ref()).unwrap();
let recorder = NamedMpkBytesRecorder::<FullPrecisionSettings>::new();
let owned_data = model_bytes.as_ref().to_vec();
let record = recorder.load(owned_data, &device).unwrap();
let model = NpModel::new(vocab.len(), embed_dim, dropout, &device);
let model = model.load_record(record);
Self {
vocab,
model,
device,
}
}
}
#[cfg(feature = "training")]
struct ExtractedSentences(
Vec<Vec<String>>,
Vec<Vec<Option<UPOS>>>,
Vec<Vec<bool>>,
HashMap<String, usize>,
);
#[cfg(feature = "training")]
impl<B: Backend + AutodiffBackend> BurnChunker<B> {
fn to_label(&self, labels: &[bool]) -> Tensor<B, 2> {
let ys: Vec<_> = labels.iter().map(|b| if *b { 1. } else { 0. }).collect();
Tensor::<B, 1, _>::from_data(TensorData::from(ys.as_slice()), &self.device)
.reshape([1, labels.len()])
}
pub fn train(
training_files: &[impl AsRef<Path>],
test_file: &impl AsRef<Path>,
word_embed_dim: usize,
dropout: f32,
epochs: usize,
lr: f64,
device: B::Device,
) -> Self {
use burn::tensor::cast::ToElement;
println!("Preparing datasets...");
let ExtractedSentences(sents, tags, labs, vocab) =
Self::extract_sents_from_files(training_files);
println!("Preparing model and training config...");
let mut model = NpModel::<B>::new(vocab.len(), word_embed_dim, dropout, &device);
let opt_config = burn::optim::AdamConfig::new();
let mut opt = opt_config.init();
let util = BurnChunker {
vocab: vocab.clone(),
model: model.clone(),
device: device.clone(),
};
let loss_fn = MseLoss::new();
let mut last_score = 0.;
println!("Training...");
for _ in 0..epochs {
let mut total_loss = 0.;
let mut total_tokens = 0;
let mut total_correct: usize = 0;
for (i, ((x, w), y)) in sents.iter().zip(tags.iter()).zip(labs.iter()).enumerate() {
let (word_tens, tag_tens) = util.to_tensors(x, w);
let y_tensor = util.to_label(y);
let logits = model.forward(word_tens, tag_tens, true);
total_correct += logits
.to_data()
.iter()
.map(|p: f32| p > 0.5)
.zip(y)
.map(|(a, b)| if a == *b { 1 } else { 0 })
.sum::<usize>();
let loss = loss_fn.forward(logits, y_tensor, Reduction::Mean);
let grads = loss.backward();
let grads = GradientsParams::from_grads(grads, &model);
model = opt.step(lr, model, grads);
total_loss += loss.into_scalar().to_f64();
total_tokens += x.len();
if i % 1000 == 0 {
println!("{i}/{}", sents.len());
}
}
println!(
"Average loss for epoch: {}",
total_loss / sents.len() as f64 * 100.
);
println!(
"{}% correct in training dataset",
total_correct as f32 / total_tokens as f32 * 100.
);
let score = util.score_model(&model, test_file);
println!("{}% correct in test dataset", score * 100.);
if score < last_score {
println!("Overfitting detected. Stopping...");
break;
}
last_score = score;
}
Self {
vocab,
model,
device,
}
}
fn score_model(&self, model: &NpModel<B>, dataset: &impl AsRef<Path>) -> f32 {
let ExtractedSentences(sents, tags, labs, _) = Self::extract_sents_from_files(&[dataset]);
let mut total_tokens = 0;
let mut total_correct: usize = 0;
for ((x, w), y) in sents.iter().zip(tags.iter()).zip(labs.iter()) {
let (word_tens, tag_tens) = self.to_tensors(x, w);
let logits = model.forward(word_tens, tag_tens, false);
total_correct += logits
.to_data()
.iter()
.map(|p: f32| p > 0.5)
.zip(y)
.map(|(a, b)| if a == *b { 1 } else { 0 })
.sum::<usize>();
total_tokens += x.len();
}
total_correct as f32 / total_tokens as f32
}
fn extract_sents_from_files(files: &[impl AsRef<Path>]) -> ExtractedSentences {
use super::np_extraction::locate_noun_phrases_in_sent;
use crate::conllu_utils::iter_sentences_in_conllu;
let mut vocab: HashMap<String, usize> = HashMap::new();
vocab.insert("<UNK>".into(), UNK_IDX);
let mut sents: Vec<Vec<String>> = Vec::new();
let mut sent_tags: Vec<Vec<Option<UPOS>>> = Vec::new();
let mut labs: Vec<Vec<bool>> = Vec::new();
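// CoNLL-U treebanks split contractions ("n't", "'ll", ...) into separate tokens;
// they are merged back onto the preceding token below, presumably so the training
// data lines up with Harper's own tokenization.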
const CONTRACTIONS: &[&str] = &["sn't", "n't", "'ll", "'ve", "'re", "'d", "'m", "'s"];
for file in files {
for sent in iter_sentences_in_conllu(file) {
let spans = locate_noun_phrases_in_sent(&sent);
let mut original_mask = vec![false; sent.tokens.len()];
for span in spans {
for i in span {
original_mask[i] = true;
}
}
let mut toks: Vec<String> = Vec::new();
let mut tags: Vec<Option<UPOS>> = Vec::new();
let mut mask: Vec<bool> = Vec::new();
for (idx, tok) in sent.tokens.iter().enumerate() {
let is_contraction = CONTRACTIONS.contains(&&tok.form[..]);
if is_contraction && !toks.is_empty() {
let prev_tok = toks.pop().unwrap();
let prev_mask = mask.pop().unwrap();
toks.push(format!("{prev_tok}{}", tok.form));
mask.push(prev_mask || original_mask[idx]);
} else {
toks.push(tok.form.clone());
tags.push(tok.upos.and_then(UPOS::from_conllu));
mask.push(original_mask[idx]);
}
}
for t in &toks {
if !vocab.contains_key(t) {
let next = vocab.len();
vocab.insert(t.clone(), next);
}
}
sents.push(toks);
sent_tags.push(tags);
labs.push(mask);
}
}
ExtractedSentences(sents, sent_tags, labs, vocab)
}
}
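// With the `training` feature enabled, the CPU backend is wrapped in `Autodiff` so
// gradients can be recorded; inference-only builds use the plain `NdArray` backend.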
#[cfg(feature = "training")]
pub type BurnChunkerCpu = BurnChunker<burn::backend::Autodiff<NdArray>>;
#[cfg(not(feature = "training"))]
pub type BurnChunkerCpu = BurnChunker<NdArray>;
impl BurnChunkerCpu {
pub fn load_from_bytes_cpu(
model_bytes: impl AsRef<[u8]>,
vocab_bytes: impl AsRef<[u8]>,
embed_dim: usize,
dropout: f32,
) -> Self {
Self::load_from_bytes(
model_bytes,
vocab_bytes,
embed_dim,
dropout,
NdArrayDevice::Cpu,
)
}
}
#[cfg(feature = "training")]
impl BurnChunkerCpu {
pub fn train_cpu(
training_files: &[impl AsRef<Path>],
test_file: &impl AsRef<Path>,
embed_dim: usize,
dropout: f32,
epochs: usize,
lr: f64,
) -> Self {
BurnChunker::<Autodiff<NdArray>>::train(
training_files,
test_file,
embed_dim,
dropout,
epochs,
lr,
NdArrayDevice::Cpu,
)
}
}
impl<B: Backend> Chunker for BurnChunker<B> {
fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
// Avoids a divide-by-zero error in the linear layer when the sentence is empty.
if sentence.is_empty() {
return Vec::new();
}
let (word_tens, tag_tens) = self.to_tensors(sentence, tags);
let prob = self.model.forward(word_tens, tag_tens, false);
prob.to_data().iter().map(|p: f32| p > 0.5).collect()
}
}
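A minimal inference sketch for the new type; the byte slices, embedding size (6), and dropout (0.3) below are placeholders that must match whatever the model was trained and serialized with:
use harper_pos_utils::{BurnChunkerCpu, Chunker};
fn chunk_with_burn(model_bytes: &[u8], vocab_bytes: &[u8], sentence: &[String]) -> Vec<bool> {
    // 6 and 0.3 are illustrative; they must equal the training-time embed_dim and dropout.
    let chunker = BurnChunkerCpu::load_from_bytes_cpu(model_bytes, vocab_bytes, 6, 0.3);
    // Without a tagger pass every tag is `None`; those fall back to the unknown index internally.
    let tags = vec![None; sentence.len()];
    // One flag per token: `true` marks tokens inside a noun phrase.
    chunker.chunk_sentence(sentence, &tags)
}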

View File

@@ -0,0 +1,61 @@
use lru::LruCache;
use std::hash::Hash;
use std::num::NonZeroUsize;
use std::sync::Mutex;
use super::Chunker;
use crate::UPOS;
/// Wraps any chunker implementation to add an LRU Cache.
/// Useful for incremental lints.
pub struct CachedChunker<C: Chunker> {
inner: C,
cache: Mutex<LruCache<CacheKey, Vec<bool>>>,
}
impl<C: Chunker> CachedChunker<C> {
pub fn new(inner: C, capacity: NonZeroUsize) -> Self {
Self {
inner,
cache: Mutex::new(LruCache::new(capacity)),
}
}
}
impl<C: Chunker> Chunker for CachedChunker<C> {
fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
let key = CacheKey::new(sentence, tags);
// Attempt a cache hit.
// Scoped so the lock guard `read` is dropped as early as possible.
if let Ok(mut read) = self.cache.try_lock() {
if let Some(result) = read.get(&key) {
return result.clone();
}
};
// We don't want to hold the lock since it may take a while to run the chunker.
let result = self.inner.chunk_sentence(sentence, tags);
if let Ok(mut cache) = self.cache.try_lock() {
cache.put(key, result.clone());
}
result
}
}
#[derive(Hash, PartialEq, Eq)]
struct CacheKey {
sentence: Vec<String>,
tags: Vec<Option<UPOS>>,
}
impl CacheKey {
fn new(sentence: &[String], tags: &[Option<UPOS>]) -> Self {
Self {
sentence: sentence.to_vec(),
tags: tags.to_vec(),
}
}
}
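A short sketch of wrapping an existing chunker in the cache; the capacity of 1,000 entries is an arbitrary illustration:
use std::num::NonZeroUsize;
use harper_pos_utils::{CachedChunker, Chunker};
fn with_cache<C: Chunker>(inner: C, sentence: &[String]) -> Vec<bool> {
    // 1,000 cached sentence/tag pairs is an arbitrary illustrative capacity.
    let cached = CachedChunker::new(inner, NonZeroUsize::new(1_000).unwrap());
    let tags = vec![None; sentence.len()];
    // The first call runs `inner` and stores the result; an identical later call
    // is answered from the LRU cache.
    cached.chunk_sentence(sentence, &tags)
}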

View File

@@ -1,11 +1,15 @@
use crate::UPOS;
mod brill_chunker;
mod burn_chunker;
mod cached_chunker;
#[cfg(feature = "training")]
mod np_extraction;
mod upos_freq_dict;
pub use brill_chunker::BrillChunker;
pub use burn_chunker::{BurnChunker, BurnChunkerCpu};
pub use cached_chunker::CachedChunker;
pub use upos_freq_dict::UPOSFreqDict;
/// An implementer of this trait is capable of identifying the noun phrases in a provided sentence.

View File

@@ -7,6 +7,8 @@ mod upos;
#[cfg(feature = "training")]
mod word_counter;
pub use chunker::{BrillChunker, Chunker, UPOSFreqDict};
pub use chunker::{
BrillChunker, BurnChunker, BurnChunkerCpu, CachedChunker, Chunker, UPOSFreqDict,
};
pub use tagger::{BrillTagger, FreqDict, FreqDictBuilder, Tagger};
pub use upos::{UPOS, UPOSIter};

View File

@@ -18,4 +18,5 @@ once_cell = "1.21.3"
serde-wasm-bindgen = "0.6.5"
serde_json = "1.0.141"
serde = { version = "1.0.219", features = ["derive"] }
getrandom = { version = "0.3.3", default-features = false, features = ["wasm_js"] }
harper-stats = { path = "../harper-stats", version = "0.54.0", features = ["js"] }

View File

@@ -3,10 +3,11 @@ format:
cargo fmt
pnpm format
# Build the WebAssembly for a specific target (usually either `web` or `bundler`)
# Build the WebAssembly module
build-wasm:
cd "{{justfile_directory()}}/harper-wasm" && wasm-pack build --target web
#!/usr/bin/env bash
cd "{{justfile_directory()}}/harper-wasm"
RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build --target web
# Build `harper.js` with all size optimizations available.
build-harperjs: build-wasm
@@ -590,6 +591,9 @@ newest-dict-changes *numCommits:
});
});
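# Print the nominal phrases Harper detects in the given text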
getnps a:
cargo run --bin harper-cli -- nominal-phrases "{{a}}"
# Suggest annotations for a potential new property annotation
suggestannotation input:
#! /usr/bin/env node
@@ -630,4 +634,4 @@ suggestannotation input:
} else {
console.log(`None of the characters of "${input}" are available to use for new annotations, and none of them are OK to be moved to make way for new annotations.`);
}
}
}

View File

@@ -35,16 +35,16 @@ chrome.runtime.onInstalled.addListener((details) => {
}
});
let linter: LocalLinter;
getDialect().then(setDialect);
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
handleRequest(request).then(sendResponse);
return true;
});
let linter: LocalLinter;
getDialect().then(setDialect);
async function enableDefaultDomains() {
const defaultEnabledDomains = [
'chatgpt.com',
@@ -93,6 +93,8 @@ async function enableDefaultDomains() {
enableDefaultDomains();
function handleRequest(message: Request): Promise<Response> {
console.log(`Handling ${message.kind} request`);
switch (message.kind) {
case 'lint':
return handleLint(message);

View File

@@ -2,9 +2,18 @@ import '@webcomponents/custom-elements';
import $ from 'jquery';
import { isVisible, leafNodes } from '../domUtils';
import LintFramework from '../LintFramework';
import ProtocolClient from '../ProtocolClient';
const fw = new LintFramework();
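// Presumably keeps the extension's background service worker alive by firing a
// trivial lint request every 400 ms.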
const keepAliveCallback = () => {
ProtocolClient.lint('', 'example.com');
setTimeout(keepAliveCallback, 400);
};
keepAliveCallback();
function scan() {
$('textarea:visible').each(function () {
if (this.getAttribute('data-enable-grammarly') == 'false' || this.disabled || this.readOnly) {

View File

@@ -2,4 +2,11 @@ import path from 'path';
import { createFixture } from 'playwright-webextext';
const pathToExtension = path.join(import.meta.dirname, '../build');
export const { test, expect } = createFixture(pathToExtension);
const { test, expect } = createFixture(pathToExtension);
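// Clear the extension's stored state after each test so nothing leaks between test cases.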
test.afterEach(async ({ context }) => {
const bg = context.serviceWorkers()[0] ?? context.backgroundPages()[0];
if (bg) await bg.evaluate(() => chrome?.storage?.local.clear?.());
});
export { test, expect };

View File

@@ -17,6 +17,9 @@ testCanIgnoreTextareaSuggestion(TEST_PAGE_URL);
test('Wraps correctly', async ({ page }) => {
await page.goto(TEST_PAGE_URL);
await page.waitForTimeout(2000);
await page.reload();
const editor = getTextarea(page);
await replaceEditorContent(
editor,
@@ -34,6 +37,9 @@ test('Wraps correctly', async ({ page }) => {
test('Scrolls correctly', async ({ page }) => {
await page.goto(TEST_PAGE_URL);
await page.waitForTimeout(2000);
await page.reload();
const editor = getTextarea(page);
await replaceEditorContent(
editor,

View File

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getLexicalEditor, replaceEditorContent } from './testUtils';
import {
clickHarperHighlight,
getLexicalEditor,
randomString,
replaceEditorContent,
} from './testUtils';
const TEST_PAGE_URL = 'https://playground.lexical.dev/';
@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.goto(TEST_PAGE_URL);
const lexical = getLexicalEditor(page);
await replaceEditorContent(lexical, 'This is an test.');
const cacheSalt = randomString(5);
await replaceEditorContent(lexical, cacheSalt);
await page.waitForTimeout(3000);
@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.waitForTimeout(3000);
// Nothing should change.
expect(lexical).toContainText('This is an test');
expect(lexical).toContainText(cacheSalt);
expect(await clickHarperHighlight(page)).toBe(false);
});

View File

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getProseMirrorEditor, replaceEditorContent } from './testUtils';
import {
clickHarperHighlight,
getProseMirrorEditor,
randomString,
replaceEditorContent,
} from './testUtils';
const TEST_PAGE_URL = 'https://prosemirror.net/';
@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.goto(TEST_PAGE_URL);
const pm = getProseMirrorEditor(page);
await replaceEditorContent(pm, 'This is an test.');
const cacheSalt = randomString(5);
await replaceEditorContent(pm, cacheSalt);
await page.waitForTimeout(3000);
@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.waitForTimeout(3000);
// Nothing should change.
expect(pm).toContainText('This is an test');
expect(pm).toContainText(cacheSalt);
expect(await clickHarperHighlight(page)).toBe(false);
});

View File

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getSlateEditor, replaceEditorContent } from './testUtils';
import {
clickHarperHighlight,
getSlateEditor,
randomString,
replaceEditorContent,
} from './testUtils';
const TEST_PAGE_URL = 'https://slatejs.org';
@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.goto(TEST_PAGE_URL);
const slate = getSlateEditor(page);
await replaceEditorContent(slate, 'This is an test.');
const cacheSalt = randomString(5);
await replaceEditorContent(slate, cacheSalt);
await page.waitForTimeout(3000);
@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
await page.waitForTimeout(3000);
// Nothing should change.
expect(slate).toContainText('This is an test');
expect(slate).toContainText(cacheSalt);
expect(await clickHarperHighlight(page)).toBe(false);
});

View File

@@ -2,6 +2,15 @@ import type { Locator, Page } from '@playwright/test';
import type { Box } from '../src/Box';
import { expect, test } from './fixtures';
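/** Generate a random string of ASCII letters; used in these tests to salt editor content so results are not reused from a cache. */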
export function randomString(length: number): string {
const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
let result = '';
for (let i = 0; i < length; i++) {
result += chars.charAt(Math.floor(Math.random() * chars.length));
}
return result;
}
/** Locate the [`Slate`](https://www.slatejs.org/examples/richtext) editor on the page. */
export function getSlateEditor(page: Page): Locator {
return page.locator('[data-slate-editor="true"]');
@@ -59,9 +68,12 @@ export function getTextarea(page: Page): Locator {
}
export async function testBasicSuggestionTextarea(testPageUrl: string) {
test('Can apply basic suggestion.', async ({ page }) => {
test('Can apply basic suggestion.', async ({ page, context }) => {
await page.goto(testPageUrl);
await page.waitForTimeout(2000);
await page.reload();
const editor = getTextarea(page);
await replaceEditorContent(editor, 'This is an test');
@@ -80,8 +92,13 @@ export async function testCanIgnoreTextareaSuggestion(testPageUrl: string) {
test('Can ignore suggestion.', async ({ page }) => {
await page.goto(testPageUrl);
await page.waitForTimeout(2000);
await page.reload();
const editor = getTextarea(page);
await replaceEditorContent(editor, 'This is an test');
const cacheSalt = randomString(5);
await replaceEditorContent(editor, cacheSalt);
await page.waitForTimeout(6000);
@@ -91,7 +108,7 @@ export async function testCanIgnoreTextareaSuggestion(testPageUrl: string) {
await page.waitForTimeout(3000);
// Nothing should change.
expect(editor).toHaveValue('This is an test');
expect(editor).toHaveValue(cacheSalt);
expect(await clickHarperHighlight(page)).toBe(false);
});
}