# tokenization_bertweet.py
  1. # coding=utf-8
  2. # Copyright (c) 2020, VinAI Research and the HuggingFace Inc. team.
  3. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """Tokenization classes for BERTweet"""
  17. import html
  18. import os
  19. import re
  20. from shutil import copyfile
  21. from typing import List, Optional, Tuple
  22. import regex
  23. from ...tokenization_utils import PreTrainedTokenizer
  24. from ...utils import logging
# Module-level logger, obtained from the package's logging utilities and keyed by this module's name.
logger = logging.get_logger(__name__)
  26. VOCAB_FILES_NAMES = {
  27. "vocab_file": "vocab.txt",
  28. "merges_file": "bpe.codes",
  29. }
  30. def get_pairs(word):
  31. """
  32. Return set of symbol pairs in a word.
  33. Word is represented as tuple of symbols (symbols being variable-length strings).
  34. """
  35. pairs = set()
  36. prev_char = word[0]
  37. for char in word[1:]:
  38. pairs.add((prev_char, char))
  39. prev_char = char
  40. pairs = set(pairs)
  41. return pairs
  42. class BertweetTokenizer(PreTrainedTokenizer):
  43. """
  44. Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.
  45. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
  46. this superclass for more information regarding those methods.
  47. Args:
  48. vocab_file (`str`):
  49. Path to the vocabulary file.
  50. merges_file (`str`):
  51. Path to the merges file.
  52. normalization (`bool`, *optional*, defaults to `False`):
  53. Whether or not to apply a normalization preprocess.
  54. bos_token (`str`, *optional*, defaults to `"<s>"`):
  55. The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
  56. <Tip>
  57. When building a sequence using special tokens, this is not the token that is used for the beginning of
  58. sequence. The token used is the `cls_token`.
  59. </Tip>
  60. eos_token (`str`, *optional*, defaults to `"</s>"`):
  61. The end of sequence token.
  62. <Tip>
  63. When building a sequence using special tokens, this is not the token that is used for the end of sequence.
  64. The token used is the `sep_token`.
  65. </Tip>
  66. sep_token (`str`, *optional*, defaults to `"</s>"`):
  67. The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
  68. sequence classification or for a text and a question for question answering. It is also used as the last
  69. token of a sequence built with special tokens.
  70. cls_token (`str`, *optional*, defaults to `"<s>"`):
  71. The classifier token which is used when doing sequence classification (classification of the whole sequence
  72. instead of per-token classification). It is the first token of the sequence when built with special tokens.
  73. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  74. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  75. token instead.
  76. pad_token (`str`, *optional*, defaults to `"<pad>"`):
  77. The token used for padding, for example when batching sequences of different lengths.
  78. mask_token (`str`, *optional*, defaults to `"<mask>"`):
  79. The token used for masking values. This is the token used when training this model with masked language
  80. modeling. This is the token which the model will try to predict.
  81. """
  82. vocab_files_names = VOCAB_FILES_NAMES
  83. def __init__(
  84. self,
  85. vocab_file,
  86. merges_file,
  87. normalization=False,
  88. bos_token="<s>",
  89. eos_token="</s>",
  90. sep_token="</s>",
  91. cls_token="<s>",
  92. unk_token="<unk>",
  93. pad_token="<pad>",
  94. mask_token="<mask>",
  95. **kwargs,
  96. ):
  97. try:
  98. from emoji import demojize
  99. self.demojizer = demojize
  100. except ImportError:
  101. logger.warning(
  102. "emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3"
  103. " install emoji==0.6.0"
  104. )
  105. self.demojizer = None
  106. self.vocab_file = vocab_file
  107. self.merges_file = merges_file
  108. self.encoder = {}
  109. self.encoder[str(bos_token)] = 0
  110. self.encoder[str(pad_token)] = 1
  111. self.encoder[str(eos_token)] = 2
  112. self.encoder[str(unk_token)] = 3
  113. self.add_from_file(vocab_file)
  114. self.decoder = {v: k for k, v in self.encoder.items()}
  115. with open(merges_file, encoding="utf-8") as merges_handle:
  116. merges = merges_handle.read().split("\n")[:-1]
  117. merges = [tuple(merge.split()[:-1]) for merge in merges]
  118. self.bpe_ranks = dict(zip(merges, range(len(merges))))
  119. self.cache = {}
  120. self.normalization = normalization
  121. self.tweetPreprocessor = TweetTokenizer()
  122. self.special_puncts = {"’": "'", "…": "..."}
  123. super().__init__(
  124. normalization=normalization,
  125. bos_token=bos_token,
  126. eos_token=eos_token,
  127. sep_token=sep_token,
  128. cls_token=cls_token,
  129. unk_token=unk_token,
  130. pad_token=pad_token,
  131. mask_token=mask_token,
  132. **kwargs,
  133. )
  134. def build_inputs_with_special_tokens(
  135. self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  136. ) -> List[int]:
  137. """
  138. Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
  139. adding special tokens. A BERTweet sequence has the following format:
  140. - single sequence: `<s> X </s>`
  141. - pair of sequences: `<s> A </s></s> B </s>`
  142. Args:
  143. token_ids_0 (`List[int]`):
  144. List of IDs to which the special tokens will be added.
  145. token_ids_1 (`List[int]`, *optional*):
  146. Optional second list of IDs for sequence pairs.
  147. Returns:
  148. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
  149. """
  150. if token_ids_1 is None:
  151. return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
  152. cls = [self.cls_token_id]
  153. sep = [self.sep_token_id]
  154. return cls + token_ids_0 + sep + sep + token_ids_1 + sep
  155. def get_special_tokens_mask(
  156. self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
  157. ) -> List[int]:
  158. """
  159. Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
  160. special tokens using the tokenizer `prepare_for_model` method.
  161. Args:
  162. token_ids_0 (`List[int]`):
  163. List of IDs.
  164. token_ids_1 (`List[int]`, *optional*):
  165. Optional second list of IDs for sequence pairs.
  166. already_has_special_tokens (`bool`, *optional*, defaults to `False`):
  167. Whether or not the token list is already formatted with special tokens for the model.
  168. Returns:
  169. `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
  170. """
  171. if already_has_special_tokens:
  172. return super().get_special_tokens_mask(
  173. token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
  174. )
  175. if token_ids_1 is None:
  176. return [1] + ([0] * len(token_ids_0)) + [1]
  177. return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
  178. def create_token_type_ids_from_sequences(
  179. self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  180. ) -> List[int]:
  181. """
  182. Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
  183. not make use of token type ids, therefore a list of zeros is returned.
  184. Args:
  185. token_ids_0 (`List[int]`):
  186. List of IDs.
  187. token_ids_1 (`List[int]`, *optional*):
  188. Optional second list of IDs for sequence pairs.
  189. Returns:
  190. `List[int]`: List of zeros.
  191. """
  192. sep = [self.sep_token_id]
  193. cls = [self.cls_token_id]
  194. if token_ids_1 is None:
  195. return len(cls + token_ids_0 + sep) * [0]
  196. return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
  197. @property
  198. def vocab_size(self):
  199. return len(self.encoder)
  200. def get_vocab(self):
  201. return dict(self.encoder, **self.added_tokens_encoder)
  202. def bpe(self, token):
  203. if token in self.cache:
  204. return self.cache[token]
  205. word = tuple(token)
  206. word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
  207. pairs = get_pairs(word)
  208. if not pairs:
  209. return token
  210. while True:
  211. bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
  212. if bigram not in self.bpe_ranks:
  213. break
  214. first, second = bigram
  215. new_word = []
  216. i = 0
  217. while i < len(word):
  218. try:
  219. j = word.index(first, i)
  220. except ValueError:
  221. new_word.extend(word[i:])
  222. break
  223. else:
  224. new_word.extend(word[i:j])
  225. i = j
  226. if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
  227. new_word.append(first + second)
  228. i += 2
  229. else:
  230. new_word.append(word[i])
  231. i += 1
  232. new_word = tuple(new_word)
  233. word = new_word
  234. if len(word) == 1:
  235. break
  236. else:
  237. pairs = get_pairs(word)
  238. word = "@@ ".join(word)
  239. word = word[:-4]
  240. self.cache[token] = word
  241. return word
  242. def _tokenize(self, text):
  243. """Tokenize a string."""
  244. if self.normalization: # Perform Tweet normalization before performing BPE
  245. text = self.normalizeTweet(text)
  246. split_tokens = []
  247. words = re.findall(r"\S+\n?", text)
  248. for token in words:
  249. split_tokens.extend(list(self.bpe(token).split(" ")))
  250. return split_tokens
  251. def normalizeTweet(self, tweet):
  252. """
  253. Normalize a raw Tweet
  254. """
  255. for punct in self.special_puncts:
  256. tweet = tweet.replace(punct, self.special_puncts[punct])
  257. tokens = self.tweetPreprocessor.tokenize(tweet)
  258. normTweet = " ".join([self.normalizeToken(token) for token in tokens])
  259. normTweet = (
  260. normTweet.replace("cannot ", "can not ")
  261. .replace("n't ", " n't ")
  262. .replace("n 't ", " n't ")
  263. .replace("ca n't", "can't")
  264. .replace("ai n't", "ain't")
  265. )
  266. normTweet = (
  267. normTweet.replace("'m ", " 'm ")
  268. .replace("'re ", " 're ")
  269. .replace("'s ", " 's ")
  270. .replace("'ll ", " 'll ")
  271. .replace("'d ", " 'd ")
  272. .replace("'ve ", " 've ")
  273. )
  274. normTweet = (
  275. normTweet.replace(" p . m .", " p.m.")
  276. .replace(" p . m ", " p.m ")
  277. .replace(" a . m .", " a.m.")
  278. .replace(" a . m ", " a.m ")
  279. )
  280. return " ".join(normTweet.split())
  281. def normalizeToken(self, token):
  282. """
  283. Normalize tokens in a Tweet
  284. """
  285. lowercased_token = token.lower()
  286. if token.startswith("@"):
  287. return "@USER"
  288. elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
  289. return "HTTPURL"
  290. elif len(token) == 1:
  291. if token in self.special_puncts:
  292. return self.special_puncts[token]
  293. if self.demojizer is not None:
  294. return self.demojizer(token)
  295. else:
  296. return token
  297. else:
  298. return token
  299. def _convert_token_to_id(self, token):
  300. """Converts a token (str) in an id using the vocab."""
  301. return self.encoder.get(token, self.encoder.get(self.unk_token))
  302. def _convert_id_to_token(self, index):
  303. """Converts an index (integer) in a token (str) using the vocab."""
  304. return self.decoder.get(index, self.unk_token)
  305. def convert_tokens_to_string(self, tokens):
  306. """Converts a sequence of tokens (string) in a single string."""
  307. out_string = " ".join(tokens).replace("@@ ", "").strip()
  308. return out_string
  309. def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
  310. if not os.path.isdir(save_directory):
  311. logger.error(f"Vocabulary path ({save_directory}) should be a directory")
  312. return
  313. out_vocab_file = os.path.join(
  314. save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
  315. )
  316. out_merge_file = os.path.join(
  317. save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
  318. )
  319. if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
  320. copyfile(self.vocab_file, out_vocab_file)
  321. elif not os.path.isfile(self.vocab_file):
  322. with open(out_vocab_file, "wb") as fi:
  323. content_spiece_model = self.sp_model.serialized_model_proto()
  324. fi.write(content_spiece_model)
  325. if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
  326. copyfile(self.merges_file, out_merge_file)
  327. return out_vocab_file, out_merge_file
  328. # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
  329. # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
  330. # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
  331. # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
  332. # return ''.join(tokens_generated_so_far)
  333. def add_from_file(self, f):
  334. """
  335. Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
  336. """
  337. if isinstance(f, str):
  338. try:
  339. with open(f, "r", encoding="utf-8") as fd:
  340. self.add_from_file(fd)
  341. except FileNotFoundError as fnfe:
  342. raise fnfe
  343. except UnicodeError:
  344. raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
  345. return
  346. lines = f.readlines()
  347. for lineTmp in lines:
  348. line = lineTmp.strip()
  349. idx = line.rfind(" ")
  350. if idx == -1:
  351. raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
  352. word = line[:idx]
  353. self.encoder[word] = len(self.encoder)
  354. # Natural Language Toolkit: Twitter Tokenizer
  355. #
  356. # Copyright (C) 2001-2020 NLTK Project
  357. # Author: Christopher Potts <cgpotts@stanford.edu>
  358. # Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
  359. # Pierpaolo Pantone <> (modifications)
  360. # URL: http://nltk.org/
  361. # For license information, see LICENSE.TXT
  362. #
"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression strings.
2. The REGEXPS strings are put, in order, into a compiled regular expression object called WORD_RE.
3. The tokenization is done by WORD_RE.findall(s), where s is the (pre-processed) user-supplied string, inside the
   tokenize() method of the class TweetTokenizer.
4. When instantiating TweetTokenizer objects, there are three options: preserve_case, reduce_len and strip_handles.
   preserve_case is set to True by default; if it is set to False, then the tokenizer will lowercase everything
   except for emoticons. reduce_len and strip_handles are set to False by default; when enabled they shorten
   lengthened character runs and remove username handles, respectively, before tokenizing.
"""
  372. ######################################################################
  373. #
  374. # import regex # https://github.com/nltk/nltk/issues/2409
  375. # import html
  376. #
  377. ######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that the phone-number pattern
# appears early in the final regex (since it can contain whitespace); in
# REGEXPS below it comes second, directly after URLS.
  381. # It also could matter that tags comes after emoticons, due to the
  382. # possibility of having text like
  383. #
  384. # <:| and some text >:)
  385. #
  386. # Most importantly, the final element should always be last, since it
  387. # does a last ditch whitespace-based tokenization of whatever is left.
  388. # ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?
  389. # This particular element is used in a couple ways, so we define it
  390. # with a name:
# Verbose-mode pattern for ASCII emoticons: eyes/nose/mouth in either reading direction, or the "<3" heart.
# It is one of the REGEXPS alternatives and is also compiled separately as EMOTICON_RE.
# docstyle-ignore
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8] # eyes
      [\-o\*\']? # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']? # optional nose
      [:;=8] # eyes
      [<>]?
      |
      <3 # heart
    )"""
# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
# Two top-level alternatives: a URL with a protocol or a "domain/" prefix followed by a path, or a naked domain
# that is not adjacent to an "@". Written for verbose mode, so whitespace and "#" comments inside it are ignored.
# docstyle-ignore
URLS = r""" # Capture 1: entire matched URL
  (?:
  https?: # URL protocol and colon
    (?:
      /{1,3} # 1-3 slashes
      | # or
      [a-z0-9%] # Single letter or digit or '%'
      # (Trying not to match e.g. "URI::Escape")
    )
    | # or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (...)
  )+
  (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (...)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
  )
  | # OR, the following to match naked domains:
  (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@) # not succeeded by a @,
    # avoid matching "foo.na" in "foo.na@example.com"
  )
"""
# Ordered tuple of pattern fragments. WORD_RE is built by joining them with "|" in exactly this order, so the
# catch-all "Remaining word types" entry must stay last.
# docstyle-ignore
# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?: # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?: # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3} # exchange
      [ *\-.\)]*
      \d{4} # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # docstyle-ignore
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
    |
    (?:[\w_]+) # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,}) # Ellipsis dots.
    |
    (?:\S) # Everything else that isn't whitespace.
    """,
)
  497. ######################################################################
  498. # This is the core tokenizing regex:
  499. WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
  500. # WORD_RE performs poorly on these patterns:
  501. HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
  502. # The emoticon string gets its own regex so that we can preserve case for
  503. # them as needed:
  504. EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
  505. # These are for regularizing HTML entities to Unicode:
  506. ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
  507. ######################################################################
  508. # Functions for converting html entities
  509. ######################################################################
  510. def _str_to_unicode(text, encoding=None, errors="strict"):
  511. if encoding is None:
  512. encoding = "utf-8"
  513. if isinstance(text, bytes):
  514. return text.decode(encoding, errors)
  515. return text
  516. def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
  517. """
  518. Remove entities from text by converting them to their corresponding unicode character.
  519. Args:
  520. text:
  521. A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
  522. keep (list):
  523. List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
  524. `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
  525. remove_illegal (bool):
  526. If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
  527. kept "as is".
  528. Returns: A unicode string with the entities removed.
  529. See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
  530. Examples:
  531. ```python
  532. >>> from nltk.tokenize.casual import _replace_html_entities
  533. >>> _replace_html_entities(b"Price: &pound;100")
  534. 'Price: \\xa3100'
  535. >>> print(_replace_html_entities(b"Price: &pound;100"))
  536. Price: £100
  537. ```"""
  538. def _convert_entity(match):
  539. entity_body = match.group(3)
  540. if match.group(1):
  541. try:
  542. if match.group(2):
  543. number = int(entity_body, 16)
  544. else:
  545. number = int(entity_body, 10)
  546. # Numeric character references in the 80-9F range are typically
  547. # interpreted by browsers as representing the characters mapped
  548. # to bytes 80-9F in the Windows-1252 encoding. For more info
  549. # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
  550. if 0x80 <= number <= 0x9F:
  551. return bytes((number,)).decode("cp1252")
  552. except ValueError:
  553. number = None
  554. else:
  555. if entity_body in keep:
  556. return match.group(0)
  557. else:
  558. number = html.entities.name2codepoint.get(entity_body)
  559. if number is not None:
  560. try:
  561. return chr(number)
  562. except (ValueError, OverflowError):
  563. pass
  564. return "" if remove_illegal else match.group(0)
  565. return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
  566. ######################################################################
  567. class TweetTokenizer:
  568. r"""
  569. Examples:
  570. ```python
  571. >>> # Tokenizer for tweets.
  572. >>> from nltk.tokenize import TweetTokenizer
  573. >>> tknzr = TweetTokenizer()
  574. >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
  575. >>> tknzr.tokenize(s0)
  576. ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
  577. >>> # Examples using *strip_handles* and *reduce_len parameters*:
  578. >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
  579. >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
  580. >>> tknzr.tokenize(s1)
  581. [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
  582. ```"""
  583. def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
  584. self.preserve_case = preserve_case
  585. self.reduce_len = reduce_len
  586. self.strip_handles = strip_handles
  587. def tokenize(self, text):
  588. """
  589. Args:
  590. text: str
  591. Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
  592. `preserve_case=False`
  593. """
  594. # Fix HTML character entities:
  595. text = _replace_html_entities(text)
  596. # Remove username handles
  597. if self.strip_handles:
  598. text = remove_handles(text)
  599. # Normalize word lengthening
  600. if self.reduce_len:
  601. text = reduce_lengthening(text)
  602. # Shorten problematic sequences of characters
  603. safe_text = HANG_RE.sub(r"\1\1\1", text)
  604. # Tokenize:
  605. words = WORD_RE.findall(safe_text)
  606. # Possibly alter the case, but avoid changing emoticons like :D into :d:
  607. if not self.preserve_case:
  608. words = [x if EMOTICON_RE.search(x) else x.lower() for x in words]
  609. return words
  610. ######################################################################
  611. # Normalization Functions
  612. ######################################################################
  613. def reduce_lengthening(text):
  614. """
  615. Replace repeated character sequences of length 3 or greater with sequences of length 3.
  616. """
  617. pattern = regex.compile(r"(.)\1{2,}")
  618. return pattern.sub(r"\1\1\1", text)
  619. def remove_handles(text):
  620. """
  621. Remove Twitter username handles from text.
  622. """
  623. pattern = regex.compile(
  624. r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
  625. )
  626. # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
  627. return pattern.sub(" ", text)
  628. ######################################################################
  629. # Tokenization Function
  630. ######################################################################
  631. def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
  632. """
  633. Convenience function for wrapping the tokenizer.
  634. """
  635. return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
  636. text
  637. )
  638. ###############################################################################