PhraseTokenizer.py

from __future__ import annotations

import collections
import json
import logging
import os
import string
from typing import Iterable

from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available

from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer

logger = logging.getLogger(__name__)


class PhraseTokenizer(WordTokenizer):
    """Tokenizes the text with respect to existing phrases in the vocab.

    This tokenizer respects phrases that are in the vocab. Phrases are separated by 'ngram_separator'; for example,
    in the Google News word2vec file, ngrams are separated with a '_', like 'New_York'. These phrases are detected
    in the text and merged into one special token. (New York is the ... => [New_York, is, the])
    """

    def __init__(
        self,
        vocab: Iterable[str] = [],
        stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
        do_lower_case: bool = False,
        ngram_separator: str = "_",
        max_ngram_length: int = 5,
    ):
        if not is_nltk_available():
            raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))

        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.ngram_separator = ngram_separator
        self.max_ngram_length = max_ngram_length
        self.set_vocab(vocab)

    def get_vocab(self):
        return self.vocab

    def set_vocab(self, vocab: Iterable[str]):
        self.vocab = vocab
        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])

        # Check for ngrams in the vocab
        self.ngram_lookup = set()
        self.ngram_lengths = set()
        for word in vocab:
            if self.ngram_separator is not None and self.ngram_separator in word:
                # Some words might be malformed in e.g. the Google News word2vec file, containing two or more _ after each other
                ngram_count = word.count(self.ngram_separator) + 1
                if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length:
                    self.ngram_lookup.add(word)
                    self.ngram_lengths.add(ngram_count)

        if len(vocab) > 0:
            logger.info(f"PhraseTokenizer - Phrase ngram lengths: {self.ngram_lengths}")
            logger.info(f"PhraseTokenizer - Num phrases: {len(self.ngram_lookup)}")

    def tokenize(self, text: str, **kwargs) -> list[int]:
        from nltk import word_tokenize

        tokens = word_tokenize(text, preserve_line=True)

        # Phrase detection: try to merge adjacent tokens into vocab phrases, longest ngrams first
        for ngram_len in sorted(self.ngram_lengths, reverse=True):
            idx = 0
            while idx <= len(tokens) - ngram_len:
                ngram = self.ngram_separator.join(tokens[idx : idx + ngram_len])
                if ngram in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram]
                elif ngram.lower() in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram.lower()]
                idx += 1

        # Map tokens to idx, filter stop words
        tokens_filtered = []
        for token in tokens:
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Fall back to the lowercased token
            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Finally, retry with surrounding punctuation stripped
            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered

    def save(self, output_path: str):
        with open(os.path.join(output_path, "phrasetokenizer_config.json"), "w") as fOut:
            json.dump(
                {
                    "vocab": list(self.word2idx.keys()),
                    "stop_words": list(self.stop_words),
                    "do_lower_case": self.do_lower_case,
                    "ngram_separator": self.ngram_separator,
                    "max_ngram_length": self.max_ngram_length,
                },
                fOut,
            )

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, "phrasetokenizer_config.json")) as fIn:
            config = json.load(fIn)

        return PhraseTokenizer(**config)
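

# Illustrative usage sketch (not part of the original module). The vocab below is hypothetical and
# only meant to show phrase merging, stop-word filtering, and the lowercase fallback; it assumes
# nltk is installed so that `word_tokenize` is available.
if __name__ == "__main__":
    demo_vocab = ["new", "york", "city", "New_York", "capital"]
    tokenizer = PhraseTokenizer(vocab=demo_vocab)
    # "New York" is merged into the phrase token "New_York" (id 3), "is"/"the" are filtered as stop
    # words, "Capital" is matched via its lowercased form (id 4), and "." is dropped once punctuation
    # stripping leaves an empty string.
    print(tokenizer.tokenize("New York is the Capital."))  # expected output: [3, 4]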