| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- from __future__ import annotations
- from abc import ABC, abstractmethod
- from typing import Iterable
# Tokens treated as stop words for English text. The list opens with single
# punctuation characters plus the two-character quote tokens "''" and "``",
# followed by lowercase words, grouped below by their first letter. It stays a
# plain list in this exact order so indexing and concatenation keep working.
ENGLISH_STOP_WORDS = [
    # Punctuation and quote tokens.
    "!", '"', "''", "``", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-",
    ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`",
    "{", "|", "}", "~",
    # a
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "ain", "all", "almost", "alone", "along", "already", "also", "although",
    "always", "am", "among", "amongst",
    "amoungst",  # codespell:ignore
    "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything",
    "anyway", "anywhere", "are", "aren", "around", "as", "at",
    # b
    "back", "be", "became", "because", "become", "becomes", "becoming", "been",
    "before", "beforehand", "behind", "being", "below", "beside", "besides",
    "between", "beyond", "bill", "both", "bottom", "but", "by",
    # c
    "call", "can", "cannot",
    "cant",  # codespell:ignore
    "co", "con", "could", "couldn", "couldnt", "cry",
    # d
    "d", "de", "describe", "detail", "did", "didn", "do", "does", "doesn",
    "doing", "don", "done", "down", "due", "during",
    # e
    "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except",
    # f
    "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
    "former", "formerly", "forty", "found", "four", "from", "front", "full",
    "further",
    # g
    "get", "give", "go",
    # h
    "had", "hadn", "has", "hasn", "hasnt", "have", "haven", "having", "he",
    "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "him", "himself", "his", "how", "however", "hundred",
    # i
    "i", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "isn",
    "it", "its", "itself",
    # j, k
    "just", "keep",
    # l
    "last", "latter", "latterly", "least", "less", "ll", "ltd",
    # m
    "m", "ma", "made", "many", "may", "me", "meanwhile", "might", "mightn",
    "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must",
    "mustn", "my", "myself",
    # n
    "name", "namely", "needn", "neither", "never", "nevertheless", "next",
    "nine", "no", "nobody", "none",
    "noone",  # codespell:ignore
    "nor", "not", "nothing", "now", "nowhere",
    # o
    "o", "of", "off", "often", "on", "once", "one", "only", "onto", "or",
    "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over",
    "own",
    # p
    "part", "per", "perhaps", "please", "put",
    # r
    "rather", "re",
    # s
    "s", "same", "see", "seem", "seemed", "seeming", "seems", "serious",
    "several", "shan", "she", "should", "shouldn", "show", "side", "since",
    "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhere", "still", "such", "system",
    # t
    "t", "take", "ten", "than", "that", "the", "their", "theirs", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout", "thru",
    "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
    "twenty", "two",
    # u
    "un", "under", "until", "up", "upon", "us",
    # v
    "ve", "very", "via",
    # w
    "was",
    "wasn",  # codespell:ignore
    "we", "well", "were", "weren", "what", "whatever", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "while", "whither", "who",
    "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
    "without", "won", "would", "wouldn",
    # y
    "y", "yet", "you", "your", "yours", "yourself", "yourselves",
]
class WordTokenizer(ABC):
    """Abstract interface for word-level tokenizers.

    The base class carries no behavior of its own: every method below is
    abstract, so a subclass can only be instantiated once it overrides all
    of them.
    """

    @abstractmethod
    def set_vocab(self, vocab: Iterable[str]):
        """Install ``vocab`` as the vocabulary of this tokenizer.

        Args:
            vocab: The vocabulary entries, as an iterable of strings.
        """

    @abstractmethod
    def get_vocab(self, vocab: Iterable[str] | None = None):
        """Return the vocabulary of this tokenizer.

        Args:
            vocab: Not needed to read the vocabulary. It is kept, now as an
                optional argument, only so that existing calls that pass a
                value keep working.
        """

    @abstractmethod
    def tokenize(self, text: str, **kwargs) -> list[int]:
        """Convert ``text`` into a list of integer token ids.

        Args:
            text: The string to tokenize.
            **kwargs: Implementation-specific options.

        Returns:
            The integer ids produced for ``text``.
        """

    @abstractmethod
    def save(self, output_path: str):
        """Persist the tokenizer under ``output_path``.

        Args:
            output_path: Destination chosen by the caller; the on-disk
                layout is defined by each implementation.
        """

    @staticmethod
    @abstractmethod
    def load(input_path: str):
        """Restore a tokenizer from ``input_path``.

        Declared as a static method, so it is called on the class rather
        than on an instance.

        Args:
            input_path: Location to read from; presumably one previously
                written by ``save`` (the base class does not enforce this).
        """
|