mirror of
https://github.com/Automattic/harper.git
synced 2025-12-05 19:26:55 -06:00
feat(chunker): build new chunker with Burn (#1579)
Cargo.lock (generated, 2731 lines changed)
File diff suppressed because it is too large
@@ -18,4 +18,4 @@ opt-level = 3

 # Useful for debugging and profiling.
 [profile.release-debug]
 inherits = "release"
 debug = 2
@@ -11,7 +11,7 @@ RUN cargo install wasm-pack
 COPY . .

 WORKDIR /usr/build/harper-wasm
-RUN wasm-pack build --release --target web
+RUN RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build --target web

 FROM node:${NODE_VERSION} AS node-build
harper-brill/finished_chunker/model.mpk (new binary file)
Binary file not shown.
harper-brill/finished_chunker/vocab.json (new file, 32962 lines)
File diff suppressed because it is too large
@@ -1,4 +1,7 @@
+use harper_pos_utils::{BurnChunkerCpu, CachedChunker};
 use lazy_static::lazy_static;
+use std::num::NonZero;
+use std::rc::Rc;
 use std::sync::Arc;

 pub use harper_pos_utils::{BrillChunker, BrillTagger, Chunker, FreqDict, Tagger, UPOS};
@@ -30,3 +33,21 @@ fn uncached_brill_chunker() -> BrillChunker {
 pub fn brill_chunker() -> Arc<BrillChunker> {
     (*BRILL_CHUNKER).clone()
 }
+
+const BURN_CHUNKER_VOCAB: &[u8; 627993] = include_bytes!("../finished_chunker/vocab.json");
+const BURN_CHUNKER_BIN: &[u8; 806312] = include_bytes!("../finished_chunker/model.mpk");
+
+thread_local! {
+    static BURN_CHUNKER: Rc<CachedChunker<BurnChunkerCpu>> = Rc::new(uncached_burn_chunker());
+}
+
+fn uncached_burn_chunker() -> CachedChunker<BurnChunkerCpu> {
+    CachedChunker::new(
+        BurnChunkerCpu::load_from_bytes_cpu(BURN_CHUNKER_BIN, BURN_CHUNKER_VOCAB, 6, 0.3),
+        NonZero::new(10000).unwrap(),
+    )
+}
+
+pub fn burn_chunker() -> Rc<CachedChunker<BurnChunkerCpu>> {
+    (BURN_CHUNKER).with(|c| c.clone())
+}
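Note on the new API: burn_chunker() mirrors the existing brill_chunker() accessor, but returns a thread-local Rc<CachedChunker<BurnChunkerCpu>> rather than a shared Arc, and the model weights and vocabulary ship inside the binary via include_bytes!. A minimal usage sketch, assuming the Tagger::tag_sentence and Chunker::chunk_sentence signatures implied by the document.rs changes later in this diff (the sample tokens are invented):

    use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};

    fn main() {
        // Hypothetical pre-tokenized sentence, whitespace already filtered out.
        let tokens = ["The", "sailor", "dogs", "the", "hatch", "."];

        // Tag first; the chunker consumes both the tokens and their tags.
        let tags = brill_tagger().tag_sentence(&tokens);

        // One flag per token: is it a member of a noun phrase?
        let np_flags = burn_chunker().chunk_sentence(&tokens, &tags);
        assert_eq!(np_flags.len(), tokens.len());
    }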
@@ -24,3 +24,4 @@ strum_macros = "0.27.2"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
training = ["harper-pos-utils/training"]
|
||||
|
||||
@@ -21,7 +21,9 @@ use harper_core::{
     word_metadata_orthography::OrthFlags,
 };
 use harper_literate_haskell::LiterateHaskellParser;
-use harper_pos_utils::{BrillChunker, BrillTagger};
+#[cfg(feature = "training")]
+use harper_pos_utils::{BrillChunker, BrillTagger, BurnChunkerCpu};
+
 use harper_stats::Stats;
 use serde::Serialize;
@@ -101,6 +103,7 @@ enum Args {
         /// The document to mine words from.
         file: PathBuf,
     },
+    #[cfg(feature = "training")]
     TrainBrillTagger {
         #[arg(short, long, default_value = "1.0")]
         candidate_selection_chance: f32,
@@ -112,6 +115,7 @@ enum Args {
         #[arg(num_args = 1..)]
         datasets: Vec<PathBuf>,
     },
+    #[cfg(feature = "training")]
     TrainBrillChunker {
         #[arg(short, long, default_value = "1.0")]
         candidate_selection_chance: f32,
@@ -123,6 +127,27 @@ enum Args {
         #[arg(num_args = 1..)]
         datasets: Vec<PathBuf>,
     },
+    #[cfg(feature = "training")]
+    TrainBurnChunker {
+        #[arg(short, long)]
+        lr: f64,
+        // The number of embedding dimensions
+        #[arg(long)]
+        dim: usize,
+        /// The path to write the final model file to.
+        #[arg(short, long)]
+        output: PathBuf,
+        /// The number of epochs to train.
+        #[arg(short, long)]
+        epochs: usize,
+        /// The dropout probability
+        #[arg(long)]
+        dropout: f32,
+        #[arg(short, long)]
+        test_file: PathBuf,
+        #[arg(num_args = 1..)]
+        datasets: Vec<PathBuf>,
+    },
     /// Print harper-core version.
     CoreVersion,
     /// Rename a flag in the dictionary and affixes.
@@ -476,6 +501,7 @@ fn main() -> anyhow::Result<()> {
             println!("harper-core v{}", harper_core::core_version());
             Ok(())
         }
+        #[cfg(feature = "training")]
         Args::TrainBrillTagger {
             datasets: dataset,
             epochs,
@@ -487,6 +513,7 @@ fn main() -> anyhow::Result<()> {

             Ok(())
         }
+        #[cfg(feature = "training")]
        Args::TrainBrillChunker {
             datasets,
             epochs,
@@ -497,6 +524,22 @@ fn main() -> anyhow::Result<()> {
             fs::write(output, serde_json::to_string_pretty(&chunker)?)?;
             Ok(())
         }
+        #[cfg(feature = "training")]
+        Args::TrainBurnChunker {
+            datasets,
+            test_file,
+            epochs,
+            dropout,
+            output,
+            lr,
+            dim: embed_dim,
+        } => {
+            let chunker =
+                BurnChunkerCpu::train_cpu(&datasets, &test_file, embed_dim, dropout, epochs, lr);
+            chunker.save_to(output);
+
+            Ok(())
+        }
         Args::RenameFlag { old, new, dir } => {
             use serde_json::Value;
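For orientation: the new TrainBurnChunker subcommand is a thin wrapper over BurnChunkerCpu::train_cpu followed by save_to, exactly as the match arm above shows. A sketch of the same calls made directly, with placeholder paths and invented hyperparameters (the 6 and 0.3 merely echo the dimension and dropout that harper-brill/src/lib.rs loads the shipped model with):

    use std::path::PathBuf;

    use harper_pos_utils::BurnChunkerCpu;

    fn main() {
        // Hypothetical training corpora and held-out evaluation file.
        let datasets = vec![PathBuf::from("datasets/train.txt")];
        let test_file = PathBuf::from("datasets/test.txt");

        // Argument order mirrors the CLI flags: embed_dim, dropout, epochs, lr.
        let chunker = BurnChunkerCpu::train_cpu(&datasets, &test_file, 6, 0.3, 10, 1e-3);

        // Placeholder output path; the CLI takes this from --output.
        chunker.save_to(PathBuf::from("finished_chunker"));
    }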
@@ -2,7 +2,7 @@ use std::cmp::Ordering;
 use std::collections::VecDeque;
 use std::fmt::Display;

-use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
+use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
 use paste::paste;

 use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
@@ -140,33 +140,37 @@ impl Document {
         self.condense_filename_extensions();
         self.match_quotes();

-        let token_strings: Vec<_> = self
-            .tokens
-            .iter()
-            .filter(|t| !t.kind.is_whitespace())
-            .map(|t| self.get_span_content_str(&t.span))
-            .collect();
+        let chunker = burn_chunker();
+        let tagger = brill_tagger();

-        let token_tags = brill_tagger().tag_sentence(&token_strings);
-        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);
+        for sent in self.tokens.iter_sentences_mut() {
+            let token_strings: Vec<_> = sent
+                .iter()
+                .filter(|t| !t.kind.is_whitespace())
+                .map(|t| t.span.get_content_string(&self.source))
+                .collect();

-        let mut i = 0;
+            let token_tags = tagger.tag_sentence(&token_strings);
+            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

-        // Annotate word metadata
-        for token in self.tokens.iter_mut() {
-            if let TokenKind::Word(meta) = &mut token.kind {
-                let word_source = token.span.get_content(&self.source);
-                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
+            let mut i = 0;

-                if let Some(inner) = &mut found_meta {
-                    inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
-                    inner.np_member = Some(np_flags[i]);
-                }
+            // Annotate word metadata
+            for token in sent.iter_mut() {
+                if let TokenKind::Word(meta) = &mut token.kind {
+                    let word_source = token.span.get_content(&self.source);
+                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

-                *meta = found_meta;
-                i += 1;
-            } else if !token.kind.is_whitespace() {
-                i += 1;
+                    if let Some(inner) = &mut found_meta {
+                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
+                        inner.np_member = Some(np_flags[i]);
+                    }
+
+                    *meta = found_meta;
+                    i += 1;
+                } else if !token.kind.is_whitespace() {
+                    i += 1;
+                }
             }
         }
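The index bookkeeping above is easy to misread: i counts non-whitespace tokens within the current sentence, keeping it in lockstep with token_tags and np_flags, which were computed from the whitespace-filtered token strings. A standalone sketch of the same invariant with toy types (everything here is hypothetical except the counting pattern):

    #[derive(Clone, Copy)]
    enum Kind {
        Word,
        Whitespace,
        Punctuation,
    }

    fn main() {
        let tokens = [Kind::Word, Kind::Whitespace, Kind::Word, Kind::Punctuation];

        // One tag per non-whitespace token, like token_tags / np_flags above.
        let tags = ["DET", "NOUN", "PUNCT"];

        let mut i = 0;
        for kind in tokens {
            match kind {
                Kind::Word => {
                    // Words consume a tag slot and receive metadata...
                    println!("word tagged {}", tags[i]);
                    i += 1;
                }
                // ...and other non-whitespace tokens (e.g. punctuation) also
                // consume a slot, even though no metadata is written for them.
                Kind::Punctuation => i += 1,
                // Whitespace was filtered out before tagging, so it must not advance i.
                Kind::Whitespace => {}
            }
        }
        assert_eq!(i, tags.len());
    }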
@@ -745,6 +749,10 @@ impl TokenStringExt for Document {
     fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
         self.tokens.iter_sentences()
     }
+
+    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
+        self.tokens.iter_sentences_mut()
+    }
 }

 impl Display for Document {
@@ -99,6 +99,10 @@ pub trait TokenStringExt {
     /// Get an iterator over token slices that represent the individual
     /// sentences in a document.
     fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
+
+    /// Get an iterator over mutable token slices that represent the individual
+    /// sentences in a document.
+    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
 }

 impl TokenStringExt for [Token] {
@@ -239,4 +243,32 @@ impl TokenStringExt for [Token] {

         first_sentence.into_iter().chain(rest).chain(last_sentence)
     }
+
+    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
+        struct SentIter<'a> {
+            rem: &'a mut [Token],
+        }
+
+        impl<'a> Iterator for SentIter<'a> {
+            type Item = &'a mut [Token];
+
+            fn next(&mut self) -> Option<Self::Item> {
+                if self.rem.is_empty() {
+                    return None;
+                }
+                let split = self
+                    .rem
+                    .iter()
+                    .position(|t| t.kind.is_sentence_terminator())
+                    .map(|i| i + 1)
+                    .unwrap_or(self.rem.len());
+                let tmp = core::mem::take(&mut self.rem);
+                let (sent, rest) = tmp.split_at_mut(split);
+                self.rem = rest;
+                Some(sent)
+            }
+        }
+
+        SentIter { rem: self }
+    }
 }
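The mem::take / split_at_mut dance above is the standard trick for lending out disjoint mutable sub-slices from an iterator without fighting the borrow checker: take() swaps an empty slice into self.rem so the full borrow can be moved out and split. A reduced sketch of the same pattern over plain integers, splitting after each zero as a stand-in for is_sentence_terminator (names here are invented):

    struct Splits<'a> {
        rem: &'a mut [i32],
    }

    impl<'a> Iterator for Splits<'a> {
        type Item = &'a mut [i32];

        fn next(&mut self) -> Option<Self::Item> {
            if self.rem.is_empty() {
                return None;
            }
            // Split just past the next terminator, or take everything left.
            let split = self
                .rem
                .iter()
                .position(|&x| x == 0)
                .map(|i| i + 1)
                .unwrap_or(self.rem.len());
            // take() leaves an empty slice behind, releasing the &mut self borrow.
            let tmp = core::mem::take(&mut self.rem);
            let (chunk, rest) = tmp.split_at_mut(split);
            self.rem = rest;
            Some(chunk)
        }
    }

    fn main() {
        let mut data = [1, 2, 0, 3, 0, 4];
        let lens: Vec<usize> = Splits { rem: &mut data }.map(|s| s.len()).collect();
        assert_eq!(lens, [3, 2, 1]);
    }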
@@ -4,7 +4,7 @@ use itertools::Itertools;
 use paste::paste;
 use serde::{Deserialize, Serialize};
 use smallvec::SmallVec;
-use strum::{EnumCount, VariantArray};
+use strum::{EnumCount as _, VariantArray as _};
 use strum_macros::{Display, EnumCount, EnumString, VariantArray};

 use std::convert::TryFrom;
File diff suppressed because it is too large (3 files)
@@ -6,68 +6,68 @@
# Unlintable Unlintable
> -->
# Unlintable Unlintable
> Part - of - speech tagging
# Unlintable NSg/V/J . P . N🅪Sg/V+ NSg/V
> Part - of - speech tagging
# Unlintable NSg/V/J+ . P . N🅪Sg/V+ NSg/V
>
#
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPr/J/P NSg+ NᴹSg . NSg/V/J . P . N🅪Sg/V NSg/V . NSg+ NSg/V NPr/C NSg+ NSg/V NPr/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPr🅪/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V NPr/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# N🅪Sg/V . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P N🅪Sg/V+ . V/J J/P I/C/Dq ISg/D$+
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPr/J/P NSg+ NᴹSg+ . NSg/V/J+ . P . N🅪Sg/V+ NSg/V . NSg+ NSg/V NPr/C NSg+ NSg/V NPr/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPr🅪/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V+ NPr/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# N🅪Sg/V+ . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P N🅪Sg/V+ . V/J J/P I/C/Dq ISg/D$+
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D$+ N🅪Sg/V+ . D/P V/J NSg/V P I/Ddem+ VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . N🅪Sg/V NPl . NPr/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . N🅪Sg/V+ NPl+ . NPr/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> adverbs , etc.
# NPl/V . W?
# NPl/V . +
>
#
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPr/V/J/C NSg/V/J NPr/J/P D N🅪Sg/V P J+
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPr/V/J/C NSg/V/J NPr/J/P D N🅪Sg/V P J
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NᴹSg+ . V NPl+ I/C+ NSg/V/J+ J NPl/V+ . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P N🅪Sg/V+ . NSg/J/P D/P NPr/V/J P NSg/J+ NPl/V+ . NSg+ . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J+ V/C+ J+ . ? ? NSg . NSg/I/V/J P D
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P N🅪Sg/V+ . NSg/J/P D/P NPr/V/J P NSg/J NPl/V+ . NSg+ . NSg/V NPl+ NSg/V+ P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J V/C J . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/V/J V/C NSg/I/J/Dq R V/J NPr🅪/V/J+ NSg+ . NPl . NPl/V NSg/V+ . V/J NPl+ .
>
#
> Principle
# N🅪Sg/V
# N🅪Sg/V+
>
#
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . N🅪Sg/V NSg/V VL JC C/P V/J V D/P NSg/V P NPl/V V/C D$+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P N🅪Sg/V+ . C/P I/J/R/Dq+ NPl/V+ NPr/VX V NPr/I/V/J/Dq C/P NSg/I/V/J NSg/V/J P N🅪Sg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J+ NPl/V+ . V/C C/P I/J/R/Dq NPl/V P N🅪Sg/V+ V+ NSg/V/J+ . I/Ddem+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPr/J/P NSg/J NPl/V+ . NSg/R V/J P NSg/I/J/Dq J NPl/V+ . . D/P NSg/J
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J+ . P . N🅪Sg/V+ NSg/V VL JC C/P V/J V D/P NSg/V P NPl/V+ V/C D$+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P N🅪Sg/V+ . C/P I/J/R/Dq NPl/V+ NPr/VX V NPr/I/V/J/Dq C/P NSg/I/V/J NSg/V/J P N🅪Sg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J NPl/V+ . V/C C/P I/J/R/Dq NPl/V P N🅪Sg/V+ V NSg/V/J . I/Ddem+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPr/J/P NSg/J+ NPl/V+ . NSg/R V/J P NSg/I/J/Dq+ J+ NPl/V+ . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V+ . NPl/V+ V+ J+ . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPr/VX W? NSg/VX D/P NSg/V+ .
# NSg P NSg/V+ . NPl/V+ V J . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPr/VX W? NSg/VX D/P+ NSg/V+ .
>
#
> The sailor dogs the hatch .
# D+ NSg NPl/V D NSg/V+ .
> The sailor dogs the hatch .
# D+ NSg+ NPl/V+ D+ NSg/V+ .
>
#
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J+ J NSg/V NPr/VX V NSg/I/C/Ddem+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P+ NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPr/I/V/J/Dq NSg/V/J NSg/J NSg/V+ . J N🅪Sg/V+ VL NSg/I/V/J NSg/J+ P V
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J J NSg/V NPr/VX V NSg/I/C/Ddem+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPr/I/V/J/Dq NSg/V/J NSg/J NSg/V+ . J+ N🅪Sg/V+ VL NSg/I/V/J NSg/J P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/Ddem+ . NSg/J N🅪Sg+ NPr/VX W? NSg/VX V/J P V NSg/I/C/Ddem+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . NPr/J/P D+ J+ N🅪Sg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPr/J/P I/Ddem+ N🅪Sg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ N🅪Sg/V/J+ . V . D/P
# I/Ddem+ . NSg/J+ N🅪Sg+ NPr/VX W? NSg/VX V/J P V NSg/I/C/Ddem+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V+ . NSg/R # . NPr/J/P D J N🅪Sg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPr/J/P I/Ddem N🅪Sg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ N🅪Sg/V/J+ . V . D/P
> watertight door ) securely " ) .
# J NSg/V+ . R . . .
>
@@ -76,358 +76,358 @@
# NSg/V+ NPl/V
>
#
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V NSg/I/C/Ddem + V # NPl/V P N🅪Sg/V+ NPr/J/P NPr🅪/V/J . NSg/V+ . NSg/V+ .
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V NSg/I/C/Ddem + V # NPl/V P N🅪Sg/V NPr/J/P NPr🅪/V/J . NSg/V+ . NSg/V+ .
> article , adjective , preposition , pronoun , adverb , conjunction , and interjection .
# NSg/V+ . NSg/V/J+ . NSg/V . NSg/V+ . NSg/V+ . NSg/V+ . V/C NSg+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . + V R NSg/I/J/Dq NPr/I/V/J/Dq NPl+ V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPr/VX+ NSg/VX+ V/J+ . NPr/J/P NSg/I/J/Dq+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D$+ . NPr/V+ . . NSg NSg/R NSg/V/J . NSg/V+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . + V R NSg/I/J/Dq+ NPr/I/V/J/Dq+ NPl+ V/C NSg/V/P . NPl+ . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPr/VX NSg/VX V/J . NPr/J/P NSg/I/J/Dq+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D$+ . NPr/V+ . . NSg NSg/R NSg/V/J+ . NSg/V+ .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# + . . J+ NSg/V/J+ . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V+ V V/J C/P NSg/V/J . NSg/V+ .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPr/J/P I/J/R/Dq+ NSg/V NPl+ . NSg/J NPl P D+ I/J+
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPr/J/P I/J/R/Dq NSg/V NPl+ . NSg/J NPl P D I/J
> root word will get different parts of speech , resulting in a large number of
# NPr/V+ NSg/V+ NPr/VX NSg/V NSg/J NPl/V P N🅪Sg/V+ . V NPr/J/P D/P NSg/J NSg/V/JC P+
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J+ NPl/V . ? C/P NSg/J NSg/V/J+ NPl/V . NPr
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D+ NSg+ NPl/V+ V/J NPr/J/P D+ NPr/V/J+ NSg+ . . NSg/V/J
# NPr/V+ NSg/V+ NPr/VX NSg/V NSg/J NPl/V P N🅪Sg/V+ . V NPr/J/P D/P NSg/J NSg/V/JC P
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J NPl/V . ? C/P NSg/J NSg/V/J NPl/V . NPr
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D NSg+ NPl/V+ V/J NPr/J/P D NPr/V/J NSg+ . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl+ NSg/V D/P NSg/JC NSg/V/JC P NPl/V+ V/C V NSg/V/J NSg/V NPr/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ NSg/IPl+ NSg/R+ NPl/V+ NSg/I NSg/J P NSg/V/J . P . N🅪Sg/V+ .
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ NSg/IPl+ NSg/R NPl/V+ NSg/I NSg/J P NSg/V/J+ . P . N🅪Sg/V+ .
>
#
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPr/J/P NSg/V/J . P . N🅪Sg/V NSg/V NSg/J/P NSg/V+ . NPr/ISg+ VL NSg/J P V P # P
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPr/J/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/J/P NSg/V+ . NPr/ISg+ VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P N🅪Sg/V C/P NPr🅪/V/J+ . NSg/V J/P J NPl/V C/P NSg/V
# # NSg/V/J NPl/V P N🅪Sg/V C/P NPr🅪/V/J+ . NSg/V J/P J NPl/V+ C/P NSg/V
> Koine Greek ( DeRose 1990 ) has used over 1 , 000 parts of speech and found that
# ? NPr/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P N🅪Sg/V+ V/C NSg/V NSg/I/C/Ddem
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R NSg/I/J/Dq+ NPl/V+ NSg/V J NPr/J/P NSg/I/C/Ddem+ N🅪Sg/V+ NSg/R NPr/J/P NPr🅪/V/J+ . D/P
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R NSg/I/J/Dq NPl/V+ NSg/V J NPr/J/P NSg/I/C/Ddem N🅪Sg/V+ NSg/R NPr/J/P NPr🅪/V/J+ . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg NPr/J/P D NPr/V P ? NPr/V/J NPl/V+ VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J/R NPr/V/J/P+ NPl . NSg/I NSg/R ? C/P NSg . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V . NSg/V/J . NSg/V/J . NSg/J . NSg/V/JC . NSg/J . NPr/V . NSg/J . V/J
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J/R NPr/V/J/P NPl . NSg/I NSg/R ? C/P NSg+ . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V+ . NSg/V/J . NSg/V/J+ . NSg/J . NSg/V/JC+ . NSg/J . NPr/V+ . NSg/J . V/J
> = no .
# . NPr/P .
>
#
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J/Dq NSg/J . NSg/V+ NPr/V/J . C/P NSg+ NSg/V C/P NPr/J NPr🅪/V/J+ VL R D+
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J/Dq NSg/J . NSg/V NPr/V/J . C/P NSg+ NSg/V C/P NPr/J NPr🅪/V/J+ VL R D
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr+ NSg/V+ NPr/V/J . V/J NPr/J/P D+ NPr+ ? NSg/V+ . NPr/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D JC NPr/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C NSg/I/J/Dq+ NSg/JC+ . NPr/J/P
# NPr+ NSg/V+ NPr/V/J . V/J NPr/J/P D NPr+ ? NSg/V+ . NPr/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D JC NPr/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C NSg/I/J/Dq NSg/JC . NPr/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr+ . NSg/V+ NPl/V P D+ NPl/V+ NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V
> for multiple languages .
# C/P NSg/J/Dq+ NPl/V+ .
# NPr+ . NSg/V+ NPl/V P D NPl/V NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V+
> for multiple languages .
# C/P NSg/J/Dq NPl/V+ .
>
#
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg+ NSg/V NSg/V+ V NSg/V NSg/V/J NPr/J/P D/P NSg P NPl/V+ . V/C D NPr/V/J P NSg+
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V+ V/J NPl/V R P N🅪Sg/V+ . NPl/V+ R V V/J P NSg/V
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J J+ NPl+ . C I/Ddem+ NPl/V P NPl NSg/I NSg/R
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J+ J+ NPl+ . C I/Ddem NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPr/V+ . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPr/J/P NPr🅪/V/J+ . V/C NSg/I/J/Dq JC
> cross - language differences . The tag sets for heavily inflected languages such as
# NPr/V/J/P+ . N🅪Sg/V+ NSg/V . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
# NPr/V/J/P+ . N🅪Sg/V+ NSg/V+ . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPr/V/J V/C NPr/J NPr/VX NSg/VX J/R NSg/J . NSg/V NPl/V+ NPr/J/P ? NPl/V+ NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPr/J NPl/V+ NPr/VX NSg/VX R+ NSg/J+ . NSg/P D+ NSg/V/J+ NSg/J . ? ?
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPr/J NPl/V+ NPr/VX NSg/VX R NSg/J . NSg/P D NSg/V/J NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPr/V/J . P # NPl . C/P NSg/V+ . NPr/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NᴹSg+ . V/C NSg/I/J/C J/P+ . . I/C D/P J/R NPr/V/J NPr/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J/R NSg/J NPl/V NPr/C D/P NSg/I/J/Dq JC NPr/V/J P NPr/I/V/J/Dq V/J NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D+ NSg/V NSg/P NSg/V+ . NSg/J NSg/V VL NSg/JC J/P NSg/JC NSg/V+ . NPl/V+ .
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPr/V/J . P # NPl+ . C/P NSg/V+ . NPr/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NᴹSg+ . V/C NSg/I/J/C J/P . . I/C D/P J/R NPr/V/J NPr/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J/R NSg/J NPl/V+ NPr/C D/P NSg/I/J/Dq JC NPr/V/J P NPr/I/V/J/Dq V/J+ NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D NSg/V+ NSg/P NSg/V+ . NSg/J NSg/V VL NSg/JC J/P NSg/JC NSg/V+ . NPl/V .
>
#
> History
# N🅪Sg
# N🅪Sg+
>
#
> The Brown Corpus
# D NPr/V/J+ NSg
# D+ NPr/V/J+ NSg+
>
#
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NᴹSg/V J/P NSg/V/J . P . N🅪Sg/V NSg/V V NSg/V R V/J P NSg NᴹSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPr/V/J NSg P NPr🅪/V/J+ C/P NSg/V+ N🅪Sg+ V D NPr/V/J NSg
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NᴹSg/V J/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V V NSg/V R V/J P NSg NᴹSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPr/V/J NSg P NPr🅪/V/J C/P NSg/V+ N🅪Sg+ V D NPr/V/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPr/V/J NSg NSg/J/P NPr+ ? V/C ? NPr+ NPr+ . NPr/J/P D
# V/J NSg/P NPr/V/J NSg+ NSg/J/P NPr+ ? V/C ? NPr+ NPr+ . NPr/J/P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P+ . #d . NPr/ISg+ NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPr🅪/V/J+ NSg/V N🅪Sg/V+ .
> made up of 500 samples from randomly chosen publications . Each sample is 2 , 000
# V NSg/V/J/P P # NPl/V+ P R+ NᴹSg/V/J NPl+ . Dq+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPr/C NPr/I/V/J/Dq NPl/V+ . NSg/V NSg/P D NSg/V/J+ NSg/V+ . NSg/V P # . # NPl/V+ . NSg/I/J/C NSg/I/C/Ddem D+
# V NSg/V/J/P P # NPl/V+ P R NᴹSg/V/J NPl+ . Dq+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPr/C NPr/I/V/J/Dq NPl/V+ . NSg/V NSg/P D NSg/V/J NSg/V+ . NSg/V+ P # . # NPl/V+ . NSg/I/J/C NSg/I/C/Ddem D
> corpus contains only complete sentences ) .
# NSg+ V J/R/C NSg/V/J+ NPl/V+ . .
# NSg+ V J/R/C NSg/V/J NPl/V+ . .
>
#
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPr/V/J NSg V R . V/J . P NSg/V/J . P . N🅪Sg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# NSg/I/J/Dq+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P NPr/V NSg/J/P NPr V/C NPr .
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPr/V/J+ NSg+ V R . V/J . P NSg/V/J+ . P . N🅪Sg/V+ NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# NSg/I/J/Dq+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P+ NPr/V+ NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPr/I/V+ . V NSg/P+
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPr/I/V+ . V NSg/P
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C/Dq . C/P NSg/V+ . NSg/V+ NSg/J/C NSg/V+ NPr/VX V . NSg/C/P NSg/V+ NSg/J/C NSg/V+ . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPr/V+ V J/P # . NSg/V/J+ . ISg/D$+ NPl/V+ NSg/V R V/J
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPr/V+ V J/P # . NSg/V/J . ISg/D$+ NPl/V+ NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V+ . V/C JC NPl+ NSg/V NPr/J/P NSg NSg/I/J/C NSg/I/C/Ddem+ NSg/J/P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg/V V R NSg/V/J . V C/P I/J/R/Dq NPl/V+ J/P I/C+ NSg/V/J NSg/V/J
# D NSg/V V R NSg/V/J . V C/P I/J/R/Dq NPl/V+ J/P I/C+ NSg/V/J NSg/V/J+
> speakers might not agree ) .
# + NᴹSg/VX/J NSg/C V . .
>
#
> This corpus has been used for innumerable studies of word - frequency and of
# I/Ddem+ NSg V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . N🅪Sg/V V/C V/J D N🅪Sg P NSg/J . V/J . NPl NPr/J/P NSg/I/J/Dq+
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J+ NPl/V+ . NPl/V+ V/J NSg/J/P V NPr/ISg+ V/J D NSg C/P NSg/I/J/Dq
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# JC NSg/V/J . P . N🅪Sg/V NSg/V NPl . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
# I/Ddem+ NSg+ V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J+ . P . N🅪Sg/V+ V/C V/J D N🅪Sg P NSg/J . V/J . NPl+ NPr/J/P NSg/I/J/Dq
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J NPl/V+ . NPl/V+ V/J NSg/J/P V NPr/ISg+ V/J D+ NSg+ C/P NSg/I/J/Dq
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# JC NSg/V/J+ . P . N🅪Sg/V+ NSg/V NPl+ . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/Ddem+ N🅪Sg/V/J+ . # . NPr/ISg+ V NSg/V V/J NSg/J/P JC NPl+ NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# NSg NSg/V+ NPr/J NSg/J+ NSg+ . NSg/V/J V/C JC+ NPl+ V R NSg/I/J/C
# NSg NSg/V+ NPr/J NSg/J NSg+ . NSg/V/J V/C JC NPl+ V R NSg/I/J/C
> thoroughly curated .
# R+ V/J+ .
# R V/J .
>
#
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R/Dq N🅪Sg/V/J . NSg/V/J . P . N🅪Sg/V NSg/V V V/J D/P NSg/J NSg/V/J P
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R/Dq N🅪Sg/V/J+ . NSg/V/J+ . P . N🅪Sg/V+ NSg/V V V/J D/P NSg/J NSg/V/J P
> natural language processing , because there are certain cases where the correct
# NSg/J+ N🅪Sg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
# NSg/J N🅪Sg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
> part of speech cannot be decided without understanding the semantics or even the
# NSg/V/J P N🅪Sg/V+ NSg/V NSg/VX NSg/V/J C/P NᴹSg/V/J+ D+ NPl NPr/C NSg/V/J D
# NSg/V/J P N🅪Sg/V+ NSg/V NSg/VX NSg/V/J C/P NᴹSg/V/J+ D NPl+ NPr/C NSg/V/J D
> pragmatics of the context . This is extremely expensive , especially because
# NPl P D+ N🅪Sg/V+ . I/Ddem+ VL R J . R C/P
# NPl P D N🅪Sg/V+ . I/Ddem+ VL R J . R C/P
> analyzing the higher levels is much harder when multiple part - of - speech
# V D+ NSg/JC+ NPl/V+ VL NSg/I/J/Dq JC NSg/I/C NSg/J/Dq NSg/V/J . P . N🅪Sg/V
# V D+ NSg/JC+ NPl/V+ VL NSg/I/J/Dq JC NSg/I/C NSg/J/Dq NSg/V/J . P . N🅪Sg/V+
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P Dq+ NSg/V+ .
# NPl+ NSg/V NSg/VX V/J C/P Dq+ NSg/V+ .
>
#
> Use of hidden Markov models
# NSg/V P V/J NPr+ NPl/V
# NSg/V P V/J NPr NPl/V+
>
#
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPr/J/P D NSg/J/P . #d . NPl NPr/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPr/J/P D NSg/J/P+ . #d . NPl NPr/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> to disambiguate parts of speech , when working to tag the Lancaster - Oslo - Bergen
# P V NPl/V P N🅪Sg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPr/J+ NPr🅪/V/J+ . ? V V NPl/V . NSg/I NSg/R P D+ NPr/V/J+
# P V NPl/V P N🅪Sg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr+
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPr/J NPr🅪/V/J+ . ? V V NPl/V+ . NSg/I NSg/R P D NPr/V/J
> Corpus ) and making a table of the probabilities of certain sequences . For
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J+ NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D+ NSg/J/P+ NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V # . P D+ N🅪Sg/V/J+ . D/P+ NSg/V/J+ # . . V/C D/P+ NSg/V/JC+ # . . NSg/V/J/P I/Ddem+ . D/P+
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D NSg/J/P NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V+ # . P D N🅪Sg/V/J+ . D/P NSg/V/J+ # . . V/C D/P NSg/V/JC+ # . . NSg/V/J/P I/Ddem+ . D/P+
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPr/V+ NPr/VX V NSg/I/C/Ddem+ . NPr/VX . NPr/J/P . D+ NPr/VX . VL NSg/V/J NPr/I/V/J/Dq NSg/J P NSg/VX D/P NSg/V C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg/V NPr/C D/P+ NSg/J+ . D+ I/J+ NSg/V+ NPr/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NᴹSg+ J/P D+ NSg/V/J/P+ NPl/V .
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P+ NSg/V+ NPr/C D/P NSg/J . D+ I/J+ NSg/V+ NPr/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NᴹSg+ J/P D+ NSg/V/J/P NPl/V+ .
>
#
> More advanced ( " higher - order " ) HMMs learn the probabilities not only of pairs
# NPr/I/V/J/Dq V/J . . NSg/JC . NSg/V . . ? NSg/V D+ NPl+ NSg/C J/R/C P NPl/V+
# NPr/I/V/J/Dq V/J . . NSg/JC . NSg/V . . ? NSg/V D NPl+ NSg/C J/R/C P NPl/V+
> but triples or even larger sequences . So , for example , if you've just seen a
# NSg/C/P NPl/V NPr/C NSg/V/J JC NPl/V+ . NSg/I/J/C . C/P NSg/V+ . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V V/J NSg/J/P D/P+ NSg/V+ . D+ NSg/J/P+ NSg/V+ NPr/VX NSg/VX J/R NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPr/C NSg/V+ . NSg/C/P NSg/I/J/Dq V/J/C/P NSg/J+ I/D NSg/V .
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V+ V/J NSg/J/P D/P NSg/V+ . D NSg/J/P NSg/V+ NPr/VX NSg/VX J/R NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPr/C NSg/V+ . NSg/C/P NSg/I/J/Dq V/J/C/P NSg/J I/D NSg/V+ .
>
#
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/Dq J NPl/V+ V J . D+ NPl NSg/V+ .
# NSg/I/C J/Dq+ J+ NPl/V+ V J . D+ NPl+ NSg/V .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPr/ISg+ VL NSg/V/J P V Dq+ N🅪Sg+ V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P Dq+ NSg/I/V/J+ . NSg/J/P V J D NPl P Dq+
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPr/J/P NSg/V . D N🅪Sg P D+ JS+ NSg+ VL NSg/J/C+ NᴹSg/V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V+ NPr/V+ NSg/I/C/Ddem+ V R I/Ddem+ V/C
# NSg+ P Dq NSg/I/V/J+ . NSg/J/P V J D NPl P Dq
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPr/J/P NSg/V . D N🅪Sg P D+ JS+ NSg+ VL NSg/J/C NᴹSg/V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V NPr/V+ NSg/I/C/Ddem+ V R I/Ddem V/C
> achieved accuracy in the 93 – 95 % range .
# V/J N🅪Sg+ NPr/J/P D # . # . NSg/V+ .
>
#
> Eugene Charniak points out in Statistical techniques for natural language
# NPr+ ? NPl/V+ NSg/V/J/R/P NPr/J/P J NPl C/P NSg/J N🅪Sg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . NSg/I/C/Ddem+ R V D NSg/I/J/Dq NSg/V/J NSg/V P Dq+ V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V . P NSg/I/J/C/Dq+ NPl/V+ NPr/VX NSg/V # . N🅪Sg+ C/P NSg/I/J/Dq+
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C NSg/I/J/Dq+ NPl/V+ J/R/C R V D$+ V/J/C/P . NSg/V/J
# NPr+ ? NPl/V+ NSg/V/J/R/P NPr/J/P J NPl C/P NSg/J+ N🅪Sg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . NSg/I/C/Ddem+ R V D NSg/I/J/Dq NSg/V/J NSg/V+ P Dq V/J NSg/V+ V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V+ . P NSg/I/J/C/Dq NPl/V+ NPr/VX NSg/V+ # . N🅪Sg+ C/P NSg/I/J/Dq
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C NSg/I/J/Dq NPl/V+ J/R/C R V D$+ V/J/C/P . NSg/V/J
> parts of speech .
# NPl/V P N🅪Sg/V+ .
>
#
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V+ V/J D NSg/V P V . V/J NSg/V/J P N🅪Sg/V+ NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPr/ISg+ V/J NSg/I/J/C/Dq+ NPl+ . NPr/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? NSg/I/J/Dq+ NPl/V . D+ NPr/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPr/V P # J NPl/V NPr/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPr/ISg+ V/J NSg/I/J/C/Dq NPl+ . NPr/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? NSg/I/J/Dq NPl/V . D+ NPr/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPr/V+ P # J NPl/V NPr/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . NSg/I/C/Ddem+ NPr/VX V NSg/R NSg/I/J/Dq NSg/R # V/J NPl/V P N🅪Sg/V+ .
>
#
> HMMs underlie the functioning of stochastic taggers and are used in various
# ? V D V P J NPl V/C V V/J NPr/J/P J
# ? V D V+ P J NPl V/C V V/J NPr/J/P J
> algorithms one of the most widely used being the bi - directional inference
# NPl+ NSg/I/V/J P D NSg/I/J/Dq R V/J NSg/V/C D NSg/J . NSg/J NSg+
> algorithm .
# NSg+ .
# NSg .
>
#
> Dynamic programming methods
# NSg/J+ NᴹSg/V+ NPl/V
# NSg/J+ NᴹSg/V+ NPl/V+
>
#
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# NPr/J/P # . NPr+ ? V/C NPr+ ? NPr/V+ R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NᴹSg/V+ NPl+ P NSg/V D I/J NSg/J NPr/J/P R V/J/C/P N🅪Sg/V/J+ . D$+
# NᴹSg/V+ NPl+ P NSg/V D I/J NSg/J+ NPr/J/P R V/J/C/P N🅪Sg/V/J+ . D$+
> methods were similar to the Viterbi algorithm known for some time in other
# NPl/V+ NSg/V NSg/J P D ? NSg V/J C/P I/J/R/Dq N🅪Sg/V/J+ NPr/J/P NSg/V/J+
# NPl/V+ NSg/V NSg/J P D ? NSg V/J C/P I/J/R/Dq N🅪Sg/V/J+ NPr/J/P NSg/V/J
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V+ . ? V/J D/P NSg/V P NPl/V+ . NSg/V/C/P NPr/V+ V/J D/P NSg/V P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg/V P V D NPl/V C/P NPl/V NSg/I/C/Ddem+ NSg/V NSg/V/J NPr/C NSg/J NPr/J/P D+
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPr/V/J+ NSg . D/P NSg/J NSg P NSg/V/J NPl+ VX NSg/V D/P NSg/I/J/Dq
# NSg/V P V D NPl/V+ C/P NPl/V NSg/I/C/Ddem+ NSg/V NSg/V/J NPr/C NSg/J NPr/J/P D
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPr/V/J NSg+ . D/P NSg/J NSg P NSg/V/J NPl+ VX NSg/V D/P NSg/I/J/Dq
> larger corpus ) . Both methods achieved an accuracy of over 95 % . DeRose's 1990
# JC NSg+ . . I/C/Dq NPl/V+ V/J D/P N🅪Sg P NSg/V/J/P # . . ? #
# JC NSg+ . . I/C/Dq NPl/V+ V/J D/P N🅪Sg+ P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg+ NSg/P NPr/V/J NSg+ V/J NPl/V/Au/Br P D+ NSg/J+ NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J+ J+ N🅪Pl+ . V/C V/J ISg/D$+ NSg/V C/P NPr/V/J . NSg/C
# NSg+ NSg/P NPr/V/J NSg+ V/J NPl/V/Au/Br P D NSg/J NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J J N🅪Pl+ . V/C V/J ISg/D$+ NSg/V+ C/P NPr/V/J . NSg/C
> it proved similarly effective .
# NPr/ISg+ V/J R+ NSg/J .
# NPr/ISg+ V/J R NSg/J .
>
#
> These findings were surprisingly disruptive to the field of natural language
# I/Ddem+ NSg NSg/V R J P D NSg/V P NSg/J+ N🅪Sg/V+
# I/Ddem+ NSg+ NSg/V R J P D NSg/V P NSg/J+ N🅪Sg/V+
> processing . The accuracy reported was higher than the typical accuracy of very
# V+ . D+ N🅪Sg+ V/J V NSg/JC C/P D NSg/J N🅪Sg P J/R
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl+ NSg/I/C/Ddem+ V/J NSg/V/J P N🅪Sg/V+ NSg/J P NSg/I/J/Dq NSg/JC
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J N🅪Sg+ . NSg+ . NSg+ . NPl+ . V/C NSg/I/J/C+ J/P . NPl/V .
# V/J+ NPl+ NSg/I/C/Ddem+ V/J NSg/V/J P N🅪Sg/V+ NSg/J+ P NSg/I/J/Dq NSg/JC
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J N🅪Sg . NSg+ . NSg+ . NPl+ . V/C NSg/I/J/C J/P . NPl/V+ .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C NSg$ NPl/V+ V NSg/V/J C/P I/J/R/Dq P D+ V/J+ NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NPl+ VL V/J . NSg/C/P I/Ddem+ V/J R+ NSg/V/J+ . I/Ddem+ V/J NSg/I/J/Dq NPr/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ NSg/I/C/Ddem+ NSg/V/J . P . N🅪Sg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
# ? V/C NSg$ NPl/V+ V NSg/V/J C/P I/J/R/Dq P D V/J NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NPl+ VL V/J . NSg/C/P I/Ddem V/J R NSg/V/J . I/Ddem V/J NSg/I/J/Dq NPr/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ NSg/I/C/Ddem+ NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
> levels of processing ; this , in turn , simplified the theory and practice of
# NPl/V P V . I/Ddem+ . NPr/J/P NSg/V . V/J D+ NSg V/C NSg/V P
# NPl/V P V+ . I/Ddem+ . NPr/J/P NSg/V . V/J D NSg V/C NSg/V P
> computerized language analysis and encouraged researchers to find ways to
# V/J N🅪Sg/V+ N🅪Sg+ V/C V/J NPl+ P NSg/V NPl+ P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J+ NPl/V+ NSg/R+ NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/V/J . P . N🅪Sg/V+ NSg+ .
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J NPl/V+ NSg/R NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V+ C/P D
> part - of - speech assignment .
# NSg/V/J+ . P . N🅪Sg/V+ NSg+ .
>
#
> Unsupervised taggers
# V/J+ NPl
# V/J NPl
>
#
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V W? V/J V V P D/P NSg/V/P+ . V NSg P
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V+ W? V/J V V P D/P+ NSg/V/P+ . V NSg+ P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V+ NPl+ . NPr/ISg+ VL . C . W? NSg/J P NSg/V V
> " unsupervised " tagging . Unsupervised tagging techniques use an untagged corpus
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P J NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D$+ NSg/V+ N🅪Pl+ V/C NSg/V D NSg NSg/J/P+ NSg . NSg/I/C/Ddem+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPr/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J . P . N🅪Sg/V NPl+ IPl+ .
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P J NSg+
> for their training data and produce the tagset by induction . That is , they
# C/P D$+ NSg/V+ N🅪Pl+ V/C NSg/V D NSg NSg/J/P NSg . NSg/I/C/Ddem+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPr/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J+ . P . N🅪Sg/V+ NPl+ IPl+ .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V+ . NPl/V+ R NSg/V NSg/I/C/Ddem+ . D . . . D/P . . V/C . D/P . V NPr/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J+ NPl/V+ . NSg/V/C/P . V . V NPr/J/P J/R NSg/J+ NPl/V+ . P J+
# NSg/J+ NPl/V+ . NSg/V/C/P . V . V NPr/J/P J/R NSg/J+ NPl/V+ . P J
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V+ NSg/V NSg/I/C/Ddem+ V R NSg/J P
> those human linguists would expect ; and the differences themselves sometimes
# I/Ddem+ NSg/V/J NPl+ VX V . V/C D+ NSg/V+ IPl+ R
> suggest valuable new insights .
# V NSg/J+ NSg/V/J+ NPl+ .
> those human linguists would expect ; and the differences themselves sometimes
# I/Ddem NSg/V/J NPl+ VX V . V/C D NSg/V+ IPl+ R
> suggest valuable new insights .
# V NSg/J NSg/V/J NPl+ .
>
#
> These two categories can be further subdivided into rule - based , stochastic , and
# I/Ddem NSg+ NPl NPr/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> These two categories can be further subdivided into rule - based , stochastic , and
# I/Ddem+ NSg+ NPl+ NPr/VX NSg/VX V/J V/J P NSg/V+ . V/J . J . V/C
> neural approaches .
# J+ NPl/V+ .
# J NPl/V+ .
>
#
> Other taggers and methods
# NSg/V/J+ NPl V/C NPl/V
> Other taggers and methods
# NSg/V/J NPl V/C NPl/V+
>
#
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R/Dq+ NSg/J NPr/V/J NPl C/P NSg/V/J . P . N🅪Sg/V NSg/V NSg/V D ?
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R/Dq NSg/J NPr/V/J NPl C/P NSg/V/J+ . P . N🅪Sg/V+ NSg/V NSg/V D ?
> algorithm , Brill tagger , Constraint Grammar , and the Baum - Welch algorithm ( also
# NSg . NSg/J NSg . NSg+ NSg/V+ . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# V/J NSg/R D NSg/V/J . NSg/J NSg+ . . V/J NPr NSg/V/J+ V/C J NPr
# V/J NSg/R D NSg/V/J . NSg/J NSg . . V/J NPr NSg/V/J+ V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J+ NPl NPr/VX I/C/Dq NSg/VX V/J V D+ ? NSg . D
# NSg/V/J+ NPl NPr/VX I/C/Dq NSg/VX V/J V D ? NSg . D+
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg/V+ . V/J NSg/J NSg VL NSg/J NPr/J/P NSg/I/C/Ddem NPr/ISg+ NPl/V D/P NPr/V/J P NSg/V+ NPl/V+ . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/Ddem+ NPl/V+ NPr/V/J C/P V D/P+ J+ NSg+ .
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/Ddem NPl/V+ NPr/V/J C/P V D/P J NSg+ .
>
#
> Many machine learning methods have also been applied to the problem of POS
# NSg/I/J/Dq+ NSg/V V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
# NSg/I/J/Dq+ NSg/V+ V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
> tagging . Methods such as SVM , maximum entropy classifier , perceptron , and
# NSg/V+ . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . NSg . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# JS . NSg/V/J/Am NSg/VX NSg/I/J/C/Dq NSg/V V/J . V/C NSg/I/J/Dq NPr/VX V N🅪Sg+ NSg/J/P
# NSg/V . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . NSg . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# JS . NSg/V/J/Am+ NSg/VX NSg/I/J/C/Dq NSg/V V/J . V/C NSg/I/J/Dq NPr/VX V N🅪Sg+ NSg/J/P
> 95 % . [ citation needed ]
# # . . . NSg+ V/J+ .
# # . . . NSg+ V/J .
>
#
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P V/J NSg P J/Dq+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D+ NSg+
# D/P V/J NSg P J/Dq+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D NSg
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V+ . I/Ddem+ NSg+ NPl/V D+ NPr+ NSg/V+ NPr/V/J J/P I/J/R/Dq P D+ NPr+ ? N🅪Pl+ .
# NSg/V+ . I/Ddem+ NSg+ NPl/V D+ NPr+ NSg/V+ NPr/V/J J/P I/J/R/Dq P D NPr+ ? N🅪Pl+ .
> so the results are directly comparable . However , many significant taggers are
# NSg/I/J/C D+ NPl/V+ V R/C NSg/J+ . C . NSg/I/J/Dq NSg/J NPl V
# NSg/I/J/C D NPl/V+ V R/C NSg/J . C . NSg/I/J/Dq NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D+ NPr/V/Am/Au+ V/J NPr/J/P V NSg/IPl+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/Ddem+ NSg/J+ NSg . . NSg . NPr/ISg+ VX NSg/C NSg/VX V/J NSg/I/C/Ddem D+ NPl/V+
# NSg/C V/J . NSg C/P P D NPr/V/Am/Au+ V/J NPr/J/P V NSg/IPl+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/Ddem NSg/J NSg . . NSg . NPr/ISg+ VX NSg/C NSg/VX V/J NSg/I/C/Ddem D+ NPl/V+
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPr/VX/JS NSg/I/C/Ddem+ NPr/VX NSg/VX V/J P D/P+ NSg/V/J/P+ NSg/V+ . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
@@ -435,6 +435,6 @@
>
#
> In 2014 , a paper reporting using the structure regularization method for
# NPr/J/P # . D/P+ N🅪Sg/V/J+ V V D+ NSg/V+ N🅪Sg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . N🅪Sg/V NSg/V . V # . J/P D/P NSg/J+ NSg/V+ NSg .
# NPr/J/P # . D/P+ N🅪Sg/V/J+ V V D NSg/V+ N🅪Sg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J+ . P . N🅪Sg/V+ NSg/V . V # . J/P D/P NSg/J NSg/V NSg .
@@ -3,15 +3,15 @@
>
#
> This document contains a list of words spelled correctly in some dialects of English , but not American English . This is designed to test the spelling suggestions we give for such mistakes .
# I/Ddem+ NSg/V V D/P NSg/V P NPl/V+ V/J R NPr/J/P I/J/R/Dq NPl P NPr🅪/V/J+ . NSg/C/P NSg/C NPr/J+ NPr🅪/V/J+ . I/Ddem+ VL V/J P NSg/V D+ NSg/V+ NPl+ IPl+ NSg/V C/P NSg/I+ NPl/V+ .
# I/Ddem+ NSg/V+ V D/P NSg/V P NPl/V+ V/J R NPr/J/P I/J/R/Dq NPl P NPr🅪/V/J+ . NSg/C/P NSg/C NPr/J NPr🅪/V/J+ . I/Ddem+ VL V/J P NSg/V D+ NSg/V+ NPl+ IPl+ NSg/V C/P NSg/I+ NPl/V+ .
>
#
> To achieve this , the filename of this file contains `.US , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
# P V I/Ddem . D NSg P I/Ddem+ NSg/V+ V Unlintable . I/C+ NPr/VX NPr/V D+ NSg/V+ NSg P NSg/V D+ NPr/J+ NSg+ . NPr/V/J C/P NSg/V/J P NSg/V D/P W? V/J NSg+ .
> To achieve this , the filename of this file contains `.US , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
# P V I/Ddem+ . D NSg P I/Ddem NSg/V+ V Unlintable . I/C+ NPr/VX NPr/V D NSg/V+ NSg P NSg/V D NPr/J NSg+ . NPr/V/J C/P NSg/V/J P NSg/V D/P W? V/J NSg+ .
>
#
> Words
# NPl/V
# NPl/V+
>
#
>
@@ -26,36 +26,36 @@
# NSg/V/Comm+ .
>
#
> Labelled .
# V/J/Comm+ .
> Labelled .
# V/J/Comm .
>
#
> Flavour .
# N🅪Sg/V/Comm+ .
>
#
> Favoured .
# V/J/Comm+ .
> Favoured .
# V/J/Comm .
>
#
> Honour .
# N🅪Sg/V/Comm+ .
>
#
> Grey .
# NPr/V/J/Comm+ .
> Grey .
# NPr/V/J/Comm .
>
#
> Quarrelled .
# V/Comm+ .
# V/Comm .
>
#
> Quarrelling .
# NᴹSg/V/Comm+ .
> Quarrelling .
# NᴹSg/V/Comm .
>
#
> Recognised .
# V/J/Au/Br+ .
# V/J/Au/Br .
>
#
> Neighbour .
@@ -63,11 +63,11 @@
>
#
> Neighbouring .
# V/Comm+ .
# V/Comm .
>
#
> Clamour .
# NSg/V/Comm+ .
> Clamour .
# NSg/V/Comm .
>
#
> Theatre .
@@ -3,15 +3,15 @@
>
#
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/Ddem+ NSg/V V NSg/V+ NPl/V P V/J+ NPl/V+ NSg/I/C/Ddem+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
# I/Ddem+ NSg/V+ V NSg/V+ NPl/V+ P V/J NPl/V+ NSg/I/C/Ddem+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
>
#
> Example Sentences
# NSg/V+ NPl/V
# NSg/V+ NPl/V+
>
#
> My favourite color is blu .
# D$+ NSg/V/J/Comm N🅪Sg/V/J/Am VL+ W? .
> My favourite color is blu .
# D$+ NSg/V/J/Comm+ N🅪Sg/V/J/Am+ VL W? .
> I must defend my honour !
# ISg+ NSg/V NSg/V D$+ N🅪Sg/V/Comm+ .
> I recognize that you recognise me .

@@ -2,17 +2,17 @@
# NPl/V
>
#
> This documents tests that different forms / variations of swears are tagged as such .
# I/Ddem+ NPl/V+ NPl/V+ NSg/I/C/Ddem NSg/J NPl/V . W? P NPl/V V V/J NSg/R NSg/I .
> This documents tests that different forms / variations of swears are tagged as such .
# I/Ddem+ NPl/V+ NPl/V+ NSg/I/C/Ddem NSg/J+ NPl/V+ . W? P NPl/V V V/J NSg/R NSg/I .
>
#
> Examples
# NPl/V
# NPl/V+
>
#
> One turd , two turds .
# NSg/I/V/J+ NSg/V/B . NSg+ NPl/V/B .
> One turd , two turds .
# NSg/I/V/J+ NSg/V+/B . NSg NPl/V/B .
>
#
> I fart , you're farting , he farts , she farted .
# ISg+ NSg/V/B . + V/B . NPr/ISg+ NPl/V/B . ISg+ V/J+/B .
# ISg+ NSg/V/B . + V/B . NPr/ISg+ NPl/V/B . ISg+ V/J/B .

File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,11 +1,11 @@
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/Ddem+ . V/C . NSg/I/C/Ddem+ . V NSg/V/J V/C V/NoAm NSg/J/Dq NPl/V NPr/J/P NSg/J+ NPr🅪/V/J+ .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V NSg/IPl+ VL+ NSg/J .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V NSg/IPl+ VL NSg/J .
>
#
> This document contains various sentences that use " this " , " that " , " these " , and
# I/Ddem+ NSg/V V J NPl/V+ NSg/I/C/Ddem+ NSg/V . I/Ddem+ . . . NSg/I/C/Ddem+ . . . I/Ddem+ . . V/C
> This document contains various sentences that use " this " , " that " , " these " , and
# I/Ddem+ NSg/V+ V J+ NPl/V+ NSg/I/C/Ddem+ NSg/V . I/Ddem+ . . . NSg/I/C/Ddem+ . . . I/Ddem . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/Ddem . NPr/J/P NSg/J NPl/V P D/P NPr/V P NSg/V+ NPl/V+ .
>
@@ -14,54 +14,54 @@
# NPl/V+
>
#
> This triangle is nice .
# I/Ddem+ NSg+ VL+ NPr/V/J+ .
> This is nice .
# I/Ddem+ VL+ NPr/V/J+ .
> That triangle is nice .
# NSg/I/C/Ddem+ NSg+ VL+ NPr/V/J+ .
> That is nice .
# NSg/I/C/Ddem+ VL+ NPr/V/J+ .
> These triangles are nice .
# I/Ddem+ NPl+ V+ NPr/V/J+ .
> These are nice .
# I/Ddem+ V+ NPr/V/J+ .
> Those triangles are nice .
# I/Ddem+ NPl+ V+ NPr/V/J+ .
> This triangle is nice .
# I/Ddem NSg VL NPr/V/J .
> This is nice .
# I/Ddem+ VL NPr/V/J .
> That triangle is nice .
# NSg/I/C/Ddem+ NSg VL NPr/V/J .
> That is nice .
# NSg/I/C/Ddem+ VL NPr/V/J .
> These triangles are nice .
# I/Ddem NPl V NPr/V/J .
> These are nice .
# I/Ddem+ V NPr/V/J .
> Those triangles are nice .
# I/Ddem NPl V NPr/V/J .
> Those are nice .
# I/Ddem+ V+ NPr/V/J .
# I/Ddem+ V NPr/V/J .
>
#
> This massage is nice .
# I/Ddem+ NSg/V+ VL+ NPr/V/J+ .
> That massage is nice .
# NSg/I/C/Ddem NSg/V+ VL+ NPr/V/J+ .
> These massages are nice .
# I/Ddem+ NPl/V+ V+ NPr/V/J+ .
> Those massages are nice .
# I/Ddem+ NPl/V+ V+ NPr/V/J+ .
> This massages well .
# I/Ddem+ NPl/V+ NSg/V/J+ .
> That massages well .
# NSg/I/C/Ddem+ NPl/V+ NSg/V/J+ .
> These massage well .
# I/Ddem+ NSg/V+ NSg/V/J+ .
> Those massage well .
# I/Ddem+ NSg/V+ NSg/V/J+ .
> This massage is nice .
# I/Ddem+ NSg/V+ VL NPr/V/J .
> That massage is nice .
# NSg/I/C/Ddem NSg/V+ VL NPr/V/J .
> These massages are nice .
# I/Ddem+ NPl/V+ V NPr/V/J .
> Those massages are nice .
# I/Ddem+ NPl/V+ V NPr/V/J .
> This massages well .
# I/Ddem+ NPl/V+ NSg/V/J .
> That massages well .
# NSg/I/C/Ddem+ NPl/V+ NSg/V/J .
> These massage well .
# I/Ddem+ NSg/V+ NSg/V/J .
> Those massage well .
# I/Ddem+ NSg/V+ NSg/V/J .
>
#
> That could be a solution .
# NSg/I/C/Ddem+ NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C/Dq+ NPl/V+ NSg/I/C/Ddem+ NSg/VX NSg/VX D/P NSg+ .
> That could be a solution .
# NSg/I/C/Ddem+ NSg/VX NSg/VX D/P+ NSg+ .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C/Dq+ NPl/V+ NSg/I/C/Ddem+ NSg/VX NSg/VX D/P+ NSg+ .
>
#
> This is all that I have .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem ISg+ NSg/VX+ .
> This is all that solutions can do .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem NPl+ NPr/VX+ NSg/VX .
> That solution can do .
# NSg/I/C/Ddem NSg+ NPr/VX+ NSg/VX .
> This is all that I have .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem ISg+ NSg/VX .
> This is all that solutions can do .
# I/Ddem+ VL NSg/I/J/C/Dq NSg/I/C/Ddem NPl+ NPr/VX NSg/VX .
> That solution can do .
# NSg/I/C/Ddem NSg+ NPr/VX NSg/VX .
>
#
> We can do this !

@@ -39,7 +39,7 @@ struct Args {

// Setting worker threads to four means the process will use about five threads total
// This is because worker threads do not include blocking threads
#[tokio::main(worker_threads = 4)]
#[tokio::main(worker_threads = 1)]
async fn main() -> anyhow::Result<()> {
    let subscriber = FmtSubscriber::builder()
        .map_writer(move |_| stderr)

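A side note on the knob tuned above: `worker_threads` sizes only tokio's async worker pool, while threads created for blocking work live in a separate pool, which is why one worker still yields roughly two OS threads overall. A hypothetical equivalent of the attribute, written against the runtime builder (illustrative, not part of this commit):

// Sketch only; mirrors #[tokio::main(worker_threads = 1)] above.
fn build_runtime() -> std::io::Result<tokio::runtime::Runtime> {
    tokio::runtime::Builder::new_multi_thread()
        .worker_threads(1) // async workers only; spawn_blocking threads are pooled separately
        .enable_all()
        .build()
}
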
@@ -15,8 +15,13 @@ serde = { version = "1.0.219", features = ["derive"] }
is-macro = "0.3.7"
rayon = { version = "1.10.0", optional = true }
rand = { version = "0.9.1", optional = true }
burn = { version = "0.18.0", default-features = false, features = ["std"] }
burn-ndarray = { version = "0.18.0", default-features = false }
serde_json = "1.0.140"
itertools = "0.14.0"
lru = "0.16.0"

[features]
default = []
threaded = ["dep:rayon"]
training = ["dep:rand"]
training = ["dep:rand", "burn/train", "burn/autodiff"]

399
harper-pos-utils/src/chunker/burn_chunker.rs
Normal file
@@ -0,0 +1,399 @@
use crate::{UPOS, chunker::Chunker};
#[cfg(feature = "training")]
use burn::backend::Autodiff;

#[cfg(feature = "training")]
use burn::nn::loss::{MseLoss, Reduction};
use burn::nn::{Dropout, DropoutConfig};
#[cfg(feature = "training")]
use burn::optim::{GradientsParams, Optimizer};
use burn::record::{FullPrecisionSettings, NamedMpkBytesRecorder, NamedMpkFileRecorder, Recorder};
use burn::tensor::TensorData;
#[cfg(feature = "training")]
use burn::tensor::backend::AutodiffBackend;

use burn::{
    module::Module,
    nn::{BiLstmConfig, EmbeddingConfig, LinearConfig},
    tensor::{Int, Tensor, backend::Backend},
};
use burn_ndarray::{NdArray, NdArrayDevice};
use hashbrown::HashMap;
use std::path::Path;

const UNK_IDX: usize = 1;

#[derive(Module, Debug)]
struct NpModel<B: Backend> {
    embedding_words: burn::nn::Embedding<B>,
    embedding_upos: burn::nn::Embedding<B>,
    lstm: burn::nn::BiLstm<B>,
    linear_out: burn::nn::Linear<B>,
    dropout: Dropout,
}

impl<B: Backend> NpModel<B> {
    fn new(vocab: usize, word_embed_dim: usize, dropout: f32, device: &B::Device) -> Self {
        let upos_embed = 8;
        let total_embed = word_embed_dim + upos_embed;

        Self {
            embedding_words: EmbeddingConfig::new(vocab, word_embed_dim).init(device),
            embedding_upos: EmbeddingConfig::new(20, upos_embed).init(device),
            lstm: BiLstmConfig::new(total_embed, total_embed, false).init(device),
            // Multiply by two because the BiLSTM emits double the hidden parameters
            linear_out: LinearConfig::new(total_embed * 2, 1).init(device),
            dropout: DropoutConfig::new(dropout as f64).init(),
        }
    }

    fn forward(
        &self,
        word_tens: Tensor<B, 2, Int>,
        tag_tens: Tensor<B, 2, Int>,
        use_dropout: bool,
    ) -> Tensor<B, 2> {
        let word_embed = self.embedding_words.forward(word_tens);
        let tag_embed = self.embedding_upos.forward(tag_tens);

        let mut x = Tensor::cat(vec![word_embed, tag_embed], 2);

        if use_dropout {
            x = self.dropout.forward(x);
        }

        let (mut x, _) = self.lstm.forward(x, None);

        if use_dropout {
            x = self.dropout.forward(x);
        }

        let x = self.linear_out.forward(x);
        x.squeeze::<2>(2)
    }
}

pub struct BurnChunker<B: Backend> {
    vocab: HashMap<String, usize>,
    model: NpModel<B>,
    device: B::Device,
}

impl<B: Backend> BurnChunker<B> {
    fn idx(&self, tok: &str) -> usize {
        *self.vocab.get(tok).unwrap_or(&UNK_IDX)
    }

    fn to_tensors(
        &self,
        sent: &[String],
        tags: &[Option<UPOS>],
    ) -> (Tensor<B, 2, Int>, Tensor<B, 2, Int>) {
        // Interleave with UPOS tags
        let idxs: Vec<_> = sent.iter().map(|t| self.idx(t) as i32).collect();

        let upos: Vec<_> = tags
            .iter()
            .map(|t| t.map(|o| o as i32 + 2).unwrap_or(1))
            .collect();

        let word_tensor =
            Tensor::<B, 1, Int>::from_data(TensorData::from(idxs.as_slice()), &self.device)
                .reshape([1, sent.len()]);

        let tag_tensor =
            Tensor::<B, 1, Int>::from_data(TensorData::from(upos.as_slice()), &self.device)
                .reshape([1, sent.len()]);

        (word_tensor, tag_tensor)
    }

    pub fn save_to(&self, dir: impl AsRef<Path>) {
        let dir = dir.as_ref();
        std::fs::create_dir_all(dir).unwrap();

        let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::new();
        self.model
            .clone()
            .save_file(dir.join("model.mpk"), &recorder)
            .unwrap();

        let vocab_bytes = serde_json::to_vec(&self.vocab).unwrap();
        std::fs::write(dir.join("vocab.json"), vocab_bytes).unwrap();
    }

    pub fn load_from_bytes(
        model_bytes: impl AsRef<[u8]>,
        vocab_bytes: impl AsRef<[u8]>,
        embed_dim: usize,
        dropout: f32,
        device: B::Device,
    ) -> Self {
        let vocab: HashMap<String, usize> = serde_json::from_slice(vocab_bytes.as_ref()).unwrap();

        let recorder = NamedMpkBytesRecorder::<FullPrecisionSettings>::new();

        let owned_data = model_bytes.as_ref().to_vec();
        let record = recorder.load(owned_data, &device).unwrap();

        let model = NpModel::new(vocab.len(), embed_dim, dropout, &device);
        let model = model.load_record(record);

        Self {
            vocab,
            model,
            device,
        }
    }
}

#[cfg(feature = "training")]
|
||||
struct ExtractedSentences(
|
||||
Vec<Vec<String>>,
|
||||
Vec<Vec<Option<UPOS>>>,
|
||||
Vec<Vec<bool>>,
|
||||
HashMap<String, usize>,
|
||||
);
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl<B: Backend + AutodiffBackend> BurnChunker<B> {
|
||||
fn to_label(&self, labels: &[bool]) -> Tensor<B, 2> {
|
||||
let ys: Vec<_> = labels.iter().map(|b| if *b { 1. } else { 0. }).collect();
|
||||
|
||||
Tensor::<B, 1, _>::from_data(TensorData::from(ys.as_slice()), &self.device)
|
||||
.reshape([1, labels.len()])
|
||||
}
|
||||
|
||||
pub fn train(
|
||||
training_files: &[impl AsRef<Path>],
|
||||
test_file: &impl AsRef<Path>,
|
||||
word_embed_dim: usize,
|
||||
dropout: f32,
|
||||
epochs: usize,
|
||||
lr: f64,
|
||||
device: B::Device,
|
||||
) -> Self {
|
||||
use burn::tensor::cast::ToElement;
|
||||
|
||||
println!("Preparing datasets...");
|
||||
let ExtractedSentences(sents, tags, labs, vocab) =
|
||||
Self::extract_sents_from_files(training_files);
|
||||
|
||||
println!("Preparing model and training config...");
|
||||
|
||||
let mut model = NpModel::<B>::new(vocab.len(), word_embed_dim, dropout, &device);
|
||||
let opt_config = burn::optim::AdamConfig::new();
|
||||
let mut opt = opt_config.init();
|
||||
|
||||
let util = BurnChunker {
|
||||
vocab: vocab.clone(),
|
||||
model: model.clone(),
|
||||
device: device.clone(),
|
||||
};
|
||||
|
||||
let loss_fn = MseLoss::new();
|
||||
let mut last_score = 0.;
|
||||
|
||||
println!("Training...");
|
||||
|
||||
        for _ in 0..epochs {
            let mut total_loss = 0.;
            let mut total_tokens = 0;
            let mut total_correct: usize = 0;

            for (i, ((x, w), y)) in sents.iter().zip(tags.iter()).zip(labs.iter()).enumerate() {
                let (word_tens, tag_tens) = util.to_tensors(x, w);
                let y_tensor = util.to_label(y);

                let logits = model.forward(word_tens, tag_tens, true);
                total_correct += logits
                    .to_data()
                    .iter()
                    .map(|p: f32| p > 0.5)
                    .zip(y)
                    .map(|(a, b)| if a == *b { 1 } else { 0 })
                    .sum::<usize>();

                let loss = loss_fn.forward(logits, y_tensor, Reduction::Mean);

                let grads = loss.backward();
                let grads = GradientsParams::from_grads(grads, &model);

                model = opt.step(lr, model, grads);

                total_loss += loss.into_scalar().to_f64();
                total_tokens += x.len();

                if i % 1000 == 0 {
                    println!("{i}/{}", sents.len());
                }
            }

            println!(
                "Average loss for epoch: {}",
                total_loss / sents.len() as f64
            );

            println!(
                "{}% correct in training dataset",
                total_correct as f32 / total_tokens as f32 * 100.
            );

            let score = util.score_model(&model, test_file);
            println!("{}% correct in test dataset", score * 100.);

            if score < last_score {
                println!("Overfitting detected. Stopping...");
                break;
            }

            last_score = score;
        }

        Self {
            vocab,
            model,
            device,
        }
    }

    fn score_model(&self, model: &NpModel<B>, dataset: &impl AsRef<Path>) -> f32 {
        let ExtractedSentences(sents, tags, labs, _) = Self::extract_sents_from_files(&[dataset]);

        let mut total_tokens = 0;
        let mut total_correct: usize = 0;

        for ((x, w), y) in sents.iter().zip(tags.iter()).zip(labs.iter()) {
            let (word_tens, tag_tens) = self.to_tensors(x, w);

            let logits = model.forward(word_tens, tag_tens, false);
            total_correct += logits
                .to_data()
                .iter()
                .map(|p: f32| p > 0.5)
                .zip(y)
                .map(|(a, b)| if a == *b { 1 } else { 0 })
                .sum::<usize>();

            total_tokens += x.len();
        }

        total_correct as f32 / total_tokens as f32
    }

    fn extract_sents_from_files(files: &[impl AsRef<Path>]) -> ExtractedSentences {
        use super::np_extraction::locate_noun_phrases_in_sent;
        use crate::conllu_utils::iter_sentences_in_conllu;

        let mut vocab: HashMap<String, usize> = HashMap::new();
        vocab.insert("<UNK>".into(), UNK_IDX);

        let mut sents: Vec<Vec<String>> = Vec::new();
        let mut sent_tags: Vec<Vec<Option<UPOS>>> = Vec::new();
        let mut labs: Vec<Vec<bool>> = Vec::new();

        const CONTRACTIONS: &[&str] = &["sn't", "n't", "'ll", "'ve", "'re", "'d", "'m", "'s"];

        for file in files {
            for sent in iter_sentences_in_conllu(file) {
                let spans = locate_noun_phrases_in_sent(&sent);

                let mut original_mask = vec![false; sent.tokens.len()];
                for span in spans {
                    for i in span {
                        original_mask[i] = true;
                    }
                }

                let mut toks: Vec<String> = Vec::new();
                let mut tags: Vec<Option<UPOS>> = Vec::new();
                let mut mask: Vec<bool> = Vec::new();

                for (idx, tok) in sent.tokens.iter().enumerate() {
                    let is_contraction = CONTRACTIONS.contains(&&tok.form[..]);
                    if is_contraction && !toks.is_empty() {
                        let prev_tok = toks.pop().unwrap();
                        let prev_mask = mask.pop().unwrap();
                        toks.push(format!("{prev_tok}{}", tok.form));
                        mask.push(prev_mask || original_mask[idx]);
                    } else {
                        toks.push(tok.form.clone());
                        tags.push(tok.upos.and_then(UPOS::from_conllu));
                        mask.push(original_mask[idx]);
                    }
                }

                for t in &toks {
                    if !vocab.contains_key(t) {
                        let next = vocab.len();
                        vocab.insert(t.clone(), next);
                    }
                }

                sents.push(toks);
                sent_tags.push(tags);
                labs.push(mask);
            }
        }

        ExtractedSentences(sents, sent_tags, labs, vocab)
    }
}

#[cfg(feature = "training")]
|
||||
pub type BurnChunkerCpu = BurnChunker<burn::backend::Autodiff<NdArray>>;
|
||||
|
||||
#[cfg(not(feature = "training"))]
|
||||
pub type BurnChunkerCpu = BurnChunker<NdArray>;
|
||||
|
||||
impl BurnChunkerCpu {
|
||||
pub fn load_from_bytes_cpu(
|
||||
model_bytes: impl AsRef<[u8]>,
|
||||
vocab_bytes: impl AsRef<[u8]>,
|
||||
embed_dim: usize,
|
||||
dropout: f32,
|
||||
) -> Self {
|
||||
Self::load_from_bytes(
|
||||
model_bytes,
|
||||
vocab_bytes,
|
||||
embed_dim,
|
||||
dropout,
|
||||
NdArrayDevice::Cpu,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl BurnChunkerCpu {
|
||||
pub fn train_cpu(
|
||||
training_files: &[impl AsRef<Path>],
|
||||
test_file: &impl AsRef<Path>,
|
||||
embed_dim: usize,
|
||||
dropout: f32,
|
||||
epochs: usize,
|
||||
lr: f64,
|
||||
) -> Self {
|
||||
BurnChunker::<Autodiff<NdArray>>::train(
|
||||
training_files,
|
||||
test_file,
|
||||
embed_dim,
|
||||
dropout,
|
||||
epochs,
|
||||
lr,
|
||||
NdArrayDevice::Cpu,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: Backend> Chunker for BurnChunker<B> {
|
||||
fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
|
||||
// Solves a divide-by-zero error in the linear layer.
|
||||
if sentence.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let (word_tens, tag_tens) = self.to_tensors(sentence, tags);
|
||||
let prob = self.model.forward(word_tens, tag_tens, false);
|
||||
prob.to_data().iter().map(|p: f32| p > 0.5).collect()
|
||||
}
|
||||
}
|
||||
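A minimal usage sketch for the new chunker (illustrative, not part of this commit). It assumes the `finished_chunker` artifacts checked in above, that the embedding dimension (6) and dropout (0.3) match the values used at training time, and that the UPOS variants named below exist as spelled:

use harper_pos_utils::{BurnChunkerCpu, Chunker, UPOS};

fn main() {
    // Load the serialized model and vocabulary produced by `save_to`.
    let model = std::fs::read("finished_chunker/model.mpk").unwrap();
    let vocab = std::fs::read("finished_chunker/vocab.json").unwrap();
    let chunker = BurnChunkerCpu::load_from_bytes_cpu(model, vocab, 6, 0.3);

    let sentence: Vec<String> = ["The", "quick", "fox", "jumps"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    let tags = vec![Some(UPOS::DET), Some(UPOS::ADJ), Some(UPOS::NOUN), Some(UPOS::VERB)];

    // One boolean per token; `true` marks tokens inside a noun phrase.
    let mask = chunker.chunk_sentence(&sentence, &tags);
    assert_eq!(mask.len(), sentence.len());
}
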
61
harper-pos-utils/src/chunker/cached_chunker.rs
Normal file
@@ -0,0 +1,61 @@
use lru::LruCache;
use std::hash::Hash;
use std::num::NonZeroUsize;
use std::sync::Mutex;

use super::Chunker;
use crate::UPOS;

/// Wraps any chunker implementation to add an LRU Cache.
/// Useful for incremental lints.
pub struct CachedChunker<C: Chunker> {
    inner: C,
    cache: Mutex<LruCache<CacheKey, Vec<bool>>>,
}

impl<C: Chunker> CachedChunker<C> {
    pub fn new(inner: C, capacity: NonZeroUsize) -> Self {
        Self {
            inner,
            cache: Mutex::new(LruCache::new(capacity)),
        }
    }
}

impl<C: Chunker> Chunker for CachedChunker<C> {
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        let key = CacheKey::new(sentence, tags);

        // Attempt a cache hit.
        // We put this in the block so `read` gets dropped as early as possible.
        if let Ok(mut read) = self.cache.try_lock() {
            if let Some(result) = read.get(&key) {
                return result.clone();
            }
        };

        // We don't want to hold the lock since it may take a while to run the chunker.
        let result = self.inner.chunk_sentence(sentence, tags);

        if let Ok(mut cache) = self.cache.try_lock() {
            cache.put(key, result.clone());
        }

        result
    }
}

#[derive(Hash, PartialEq, Eq)]
struct CacheKey {
    sentence: Vec<String>,
    tags: Vec<Option<UPOS>>,
}

impl CacheKey {
    fn new(sentence: &[String], tags: &[Option<UPOS>]) -> Self {
        Self {
            sentence: sentence.to_vec(),
            tags: tags.to_vec(),
        }
    }
}

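Because `CachedChunker` is generic over any `Chunker`, the wrapper is easy to exercise with a toy inner implementation. A sketch (the no-op chunker here is hypothetical, written only to demonstrate the cache):

use harper_pos_utils::{CachedChunker, Chunker, UPOS};
use std::num::NonZeroUsize;

// A trivial stand-in chunker that marks nothing as a noun phrase.
struct NoopChunker;

impl Chunker for NoopChunker {
    fn chunk_sentence(&self, sentence: &[String], _tags: &[Option<UPOS>]) -> Vec<bool> {
        vec![false; sentence.len()]
    }
}

fn main() {
    let cached = CachedChunker::new(NoopChunker, NonZeroUsize::new(100).unwrap());
    let sent: Vec<String> = vec!["Hello".into(), "world".into()];
    let tags = vec![None, None];

    // The second call with identical inputs is answered from the LRU cache.
    let first = cached.chunk_sentence(&sent, &tags);
    let second = cached.chunk_sentence(&sent, &tags);
    assert_eq!(first, second);
}
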
@@ -1,11 +1,15 @@
use crate::UPOS;

mod brill_chunker;
mod burn_chunker;
mod cached_chunker;
#[cfg(feature = "training")]
mod np_extraction;
mod upos_freq_dict;

pub use brill_chunker::BrillChunker;
pub use burn_chunker::{BurnChunker, BurnChunkerCpu};
pub use cached_chunker::CachedChunker;
pub use upos_freq_dict::UPOSFreqDict;

/// An implementer of this trait is capable of identifying the noun phrases in a provided sentence.

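To make the trait's contract concrete, here is a hypothetical baseline implementer (not part of this commit) that simply marks determiner, adjective, and noun tokens as inside a noun phrase; the shipped implementations are BrillChunker and BurnChunker:

use harper_pos_utils::{Chunker, UPOS};

struct NaiveNpChunker;

impl Chunker for NaiveNpChunker {
    // Flag DET/ADJ/NOUN/PROPN tokens as noun-phrase members.
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        sentence
            .iter()
            .zip(tags)
            .map(|(_, tag)| {
                matches!(
                    tag,
                    Some(UPOS::DET) | Some(UPOS::ADJ) | Some(UPOS::NOUN) | Some(UPOS::PROPN)
                )
            })
            .collect()
    }
}
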
@@ -7,6 +7,8 @@ mod upos;
#[cfg(feature = "training")]
mod word_counter;

pub use chunker::{BrillChunker, Chunker, UPOSFreqDict};
pub use chunker::{
    BrillChunker, BurnChunker, BurnChunkerCpu, CachedChunker, Chunker, UPOSFreqDict,
};
pub use tagger::{BrillTagger, FreqDict, FreqDictBuilder, Tagger};
pub use upos::{UPOS, UPOSIter};

@@ -18,4 +18,5 @@ once_cell = "1.21.3"
serde-wasm-bindgen = "0.6.5"
serde_json = "1.0.141"
serde = { version = "1.0.219", features = ["derive"] }
getrandom = { version = "0.3.3", default-features = false, features = ["wasm_js"] }
harper-stats = { path = "../harper-stats", version = "0.54.0", features = ["js"] }

12
justfile
@@ -3,10 +3,11 @@ format:
    cargo fmt
    pnpm format

# Build the WebAssembly for a specific target (usually either `web` or `bundler`)
# Build the WebAssembly module
build-wasm:
    cd "{{justfile_directory()}}/harper-wasm" && wasm-pack build --target web
    #!/usr/bin/env bash
    cd "{{justfile_directory()}}/harper-wasm"
    RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build --target web

# Build `harper.js` with all size optimizations available.
build-harperjs: build-wasm
@@ -590,6 +591,9 @@ newest-dict-changes *numCommits:
        });
    });

getnps a:
    cargo run --bin harper-cli -- nominal-phrases "{{a}}"

# Suggest annotations for a potential new property annotation
suggestannotation input:
    #! /usr/bin/env node
@@ -630,4 +634,4 @@ suggestannotation input:
        } else {
            console.log(`None of the characters of "${input}" are available to use for new annotations, and none of them are OK to be moved to make way for new annotations.`);
        }
    }
}

@@ -35,16 +35,16 @@ chrome.runtime.onInstalled.addListener((details) => {
    }
});

let linter: LocalLinter;

getDialect().then(setDialect);

chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    handleRequest(request).then(sendResponse);

    return true;
});

let linter: LocalLinter;

getDialect().then(setDialect);

async function enableDefaultDomains() {
    const defaultEnabledDomains = [
        'chatgpt.com',
@@ -93,6 +93,8 @@ async function enableDefaultDomains() {
enableDefaultDomains();

function handleRequest(message: Request): Promise<Response> {
    console.log(`Handling ${message.kind} request`);

    switch (message.kind) {
        case 'lint':
            return handleLint(message);

@@ -2,9 +2,18 @@ import '@webcomponents/custom-elements';
import $ from 'jquery';
import { isVisible, leafNodes } from '../domUtils';
import LintFramework from '../LintFramework';
import ProtocolClient from '../ProtocolClient';

const fw = new LintFramework();

const keepAliveCallback = () => {
    ProtocolClient.lint('', 'example.com');

    setTimeout(keepAliveCallback, 400);
};

keepAliveCallback();

function scan() {
    $('textarea:visible').each(function () {
        if (this.getAttribute('data-enable-grammarly') == 'false' || this.disabled || this.readOnly) {

@@ -2,4 +2,11 @@ import path from 'path';
import { createFixture } from 'playwright-webextext';

const pathToExtension = path.join(import.meta.dirname, '../build');
export const { test, expect } = createFixture(pathToExtension);
const { test, expect } = createFixture(pathToExtension);

test.afterEach(async ({ context }) => {
    const bg = context.serviceWorkers()[0] ?? context.backgroundPages()[0];
    if (bg) await bg.evaluate(() => chrome?.storage?.local.clear?.());
});

export { test, expect };

@@ -17,6 +17,9 @@ testCanIgnoreTextareaSuggestion(TEST_PAGE_URL);
test('Wraps correctly', async ({ page }) => {
    await page.goto(TEST_PAGE_URL);

    await page.waitForTimeout(2000);
    await page.reload();

    const editor = getTextarea(page);
    await replaceEditorContent(
        editor,
@@ -34,6 +37,9 @@ test('Wraps correctly', async ({ page }) => {
test('Scrolls correctly', async ({ page }) => {
    await page.goto(TEST_PAGE_URL);

    await page.waitForTimeout(2000);
    await page.reload();

    const editor = getTextarea(page);
    await replaceEditorContent(
        editor,

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getLexicalEditor, replaceEditorContent } from './testUtils';
import {
    clickHarperHighlight,
    getLexicalEditor,
    randomString,
    replaceEditorContent,
} from './testUtils';

const TEST_PAGE_URL = 'https://playground.lexical.dev/';

@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.goto(TEST_PAGE_URL);
    const lexical = getLexicalEditor(page);

    await replaceEditorContent(lexical, 'This is an test.');
    const cacheSalt = randomString(5);
    await replaceEditorContent(lexical, cacheSalt);

    await page.waitForTimeout(3000);

@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.waitForTimeout(3000);

    // Nothing should change.
    expect(lexical).toContainText('This is an test');
    expect(lexical).toContainText(cacheSalt);
    expect(await clickHarperHighlight(page)).toBe(false);
});

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getProseMirrorEditor, replaceEditorContent } from './testUtils';
import {
    clickHarperHighlight,
    getProseMirrorEditor,
    randomString,
    replaceEditorContent,
} from './testUtils';

const TEST_PAGE_URL = 'https://prosemirror.net/';

@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.goto(TEST_PAGE_URL);
    const pm = getProseMirrorEditor(page);

    await replaceEditorContent(pm, 'This is an test.');
    const cacheSalt = randomString(5);
    await replaceEditorContent(pm, cacheSalt);

    await page.waitForTimeout(3000);

@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.waitForTimeout(3000);

    // Nothing should change.
    expect(pm).toContainText('This is an test');
    expect(pm).toContainText(cacheSalt);
    expect(await clickHarperHighlight(page)).toBe(false);
});

@@ -1,5 +1,10 @@
import { expect, test } from './fixtures';
import { clickHarperHighlight, getSlateEditor, replaceEditorContent } from './testUtils';
import {
    clickHarperHighlight,
    getSlateEditor,
    randomString,
    replaceEditorContent,
} from './testUtils';

const TEST_PAGE_URL = 'https://slatejs.org';

@@ -27,7 +32,8 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.goto(TEST_PAGE_URL);
    const slate = getSlateEditor(page);

    await replaceEditorContent(slate, 'This is an test.');
    const cacheSalt = randomString(5);
    await replaceEditorContent(slate, cacheSalt);

    await page.waitForTimeout(3000);

@@ -37,6 +43,6 @@ test('Can ignore suggestion.', async ({ page }) => {
    await page.waitForTimeout(3000);

    // Nothing should change.
    expect(slate).toContainText('This is an test');
    expect(slate).toContainText(cacheSalt);
    expect(await clickHarperHighlight(page)).toBe(false);
});

@@ -2,6 +2,15 @@ import type { Locator, Page } from '@playwright/test';
import type { Box } from '../src/Box';
import { expect, test } from './fixtures';

export function randomString(length: number): string {
    const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    let result = '';
    for (let i = 0; i < length; i++) {
        result += chars.charAt(Math.floor(Math.random() * chars.length));
    }
    return result;
}

/** Locate the [`Slate`](https://www.slatejs.org/examples/richtext) editor on the page. */
export function getSlateEditor(page: Page): Locator {
    return page.locator('[data-slate-editor="true"]');
@@ -59,9 +68,12 @@ export function getTextarea(page: Page): Locator {
}

export async function testBasicSuggestionTextarea(testPageUrl: string) {
    test('Can apply basic suggestion.', async ({ page }) => {
    test('Can apply basic suggestion.', async ({ page, context }) => {
        await page.goto(testPageUrl);

        await page.waitForTimeout(2000);
        await page.reload();

        const editor = getTextarea(page);
        await replaceEditorContent(editor, 'This is an test');

@@ -80,8 +92,13 @@ export async function testCanIgnoreTextareaSuggestion(testPageUrl: string) {
    test('Can ignore suggestion.', async ({ page }) => {
        await page.goto(testPageUrl);

        await page.waitForTimeout(2000);
        await page.reload();

        const editor = getTextarea(page);
        await replaceEditorContent(editor, 'This is an test');

        const cacheSalt = randomString(5);
        await replaceEditorContent(editor, cacheSalt);

        await page.waitForTimeout(6000);

@@ -91,7 +108,7 @@ export async function testCanIgnoreTextareaSuggestion(testPageUrl: string) {
        await page.waitForTimeout(3000);

        // Nothing should change.
        expect(editor).toHaveValue('This is an test');
        expect(editor).toHaveValue(cacheSalt);
        expect(await clickHarperHighlight(page)).toBe(false);
    });
}