| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403 |
import html
import itertools
import os
import re
from string import Template
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple

from tokenizers import Encoding, Tokenizer
# Load the stylesheet that sits next to this module once, at import time.
# ``css`` is later used as the default ``css_styles`` argument of ``HTMLBody``.
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
# Pin the encoding so the decoded text does not depend on the platform's
# locale default (which is not UTF-8 everywhere).
with open(css_filename, encoding="utf-8") as f:
    css = f.read()
class Annotation:
    """
    A labelled span of characters in a text.

    Args:
        start (:obj:`int`):
            Index of the first character covered by the annotation

        end (:obj:`int`):
            Index one past the last covered character, i.e. the span is the half-open
            range ``[start, end)``

        label (:obj:`str`):
            The label attached to the span
    """

    start: int
    end: int
    # Declared as ``str`` to agree with the constructor signature below.
    label: str

    def __init__(self, start: int, end: int, label: str):
        self.start = start
        self.end = end
        self.label = label
# A list with one slot per position, each holding either an integer index or
# ``None`` when nothing applies to that position.
PartialIntList = List[Optional[int]]
# A list of ``Annotation`` objects.
AnnotationList = List[Annotation]
# Immutable (token_ix, anno_ix) pair. Two characters whose keys compare equal
# share the same first token index and the same annotation index; either field
# may be ``None``.
CharStateKey = NamedTuple(
    "CharStateKey",
    [
        ("token_ix", Optional[int]),
        ("anno_ix", Optional[int]),
    ],
)
class CharState:
    """
    Mutable record of what is known about a single character position.

    Attributes are filled in after construction: ``tokens`` collects the indices of
    every token covering the character and ``anno_ix`` holds the index of the
    annotation covering it (``None`` when there is none).

    Args:
        char_ix (:obj:`int`, `optional`):
            The position of the character, or ``None``
    """

    char_ix: Optional[int]

    def __init__(self, char_ix):
        self.tokens: List[int] = []
        self.anno_ix: Optional[int] = None
        self.char_ix = char_ix

    @property
    def token_ix(self):
        """
        The first token index recorded for this character, or ``None`` if no token was recorded
        """
        if self.tokens:
            return self.tokens[0]
        return None

    @property
    def is_multitoken(self):
        """
        Whether more than one token was recorded for this character
        (BPE tokenizers can output more than one token for a char)
        """
        return len(self.tokens) > 1

    def partition_key(self) -> CharStateKey:
        """
        Returns:
            :obj:`CharStateKey`: The ``(token_ix, anno_ix)`` pair used to decide whether two
            characters belong to the same group
        """
        return CharStateKey(token_ix=self.token_ix, anno_ix=self.anno_ix)
class Aligned:
    """Empty class: it defines no attributes or methods of its own."""
class EncodingVisualizer:
    """
    Build an EncodingVisualizer

    Args:
        tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

        default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

        annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    """

    # Raw string: in a normal string literal ``\b`` is the backspace character, not the
    # regex word-boundary assertion.
    unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)

    def __init__(
        self,
        tokenizer: Tokenizer,
        default_to_notebook: bool = True,
        annotation_converter: Optional[Callable[[Any], Annotation]] = None,
    ):
        if default_to_notebook:
            # Fail at construction time rather than on the first call if IPython is missing.
            EncodingVisualizer._import_ipython_display(
                "\nYou can also pass `default_to_notebook=False` to get back raw HTML\n"
            )

        self.tokenizer = tokenizer
        self.default_to_notebook = default_to_notebook
        # NOTE: the attribute name is misspelled ("coverter"); it is kept as-is so that any
        # code reading it from outside keeps working.
        self.annotation_coverter = annotation_converter

    @staticmethod
    def _import_ipython_display(hint: str = ""):
        """
        Import the IPython display helpers

        Args:
            hint (:obj:`str`, `optional`):
                Extra text appended to the error message when the import fails

        Returns:
            The ``(HTML, display)`` pair from ``IPython.display``

        Raises:
            Exception: If IPython cannot be imported (chained to the original ImportError)
        """
        try:
            # ``IPython.display`` is the public location of these names; importing
            # ``display`` from ``IPython.core.display`` is deprecated.
            from IPython.display import HTML, display
        except ImportError as err:
            raise Exception(
                "We couldn't import IPython utils for html display.\n"
                "Are you running in a notebook?" + hint
            ) from err
        return HTML, display

    def __call__(
        self,
        text: str,
        annotations: Optional[AnnotationList] = None,
        default_to_notebook: Optional[bool] = None,
    ) -> Optional[str]:
        """
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. They can either be Annotation
                objects or anything else if you instantiated the visualizer with a converter
                function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `None`):
                If True, will render the html in a notebook. If False, returns an html string.
                If None, the value given to the constructor is used.

        Returns:
            The HTML string if not rendering in a notebook, otherwise returns None and
            renders the HTML in the notebook
        """
        render_in_notebook = self.default_to_notebook if default_to_notebook is None else default_to_notebook

        if render_in_notebook:
            # Import before tokenizing so a missing IPython is reported before any work is done.
            HTML, display = EncodingVisualizer._import_ipython_display()

        if annotations is None:
            annotations = []
        if self.annotation_coverter is not None:
            annotations = [self.annotation_coverter(a) for a in annotations]

        encoding = self.tokenizer.encode(text)
        rendered = EncodingVisualizer.__make_html(text, encoding, annotations)

        if render_in_notebook:
            display(HTML(rendered))
            return None
        return rendered

    @staticmethod
    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
        """
        Generates a color palette for all the labels in a given set of annotations

        Args:
            annotations (:obj:`Annotation`):
                A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        """
        if not annotations:
            return {}

        labels = {anno.label for anno in annotations}
        # Spread the hues over the labels, but never step by less than 20.
        h_step = max(255 // len(labels), 20)
        s = 32
        l = 64  # noqa: E741
        h = 10
        colors = {}
        for label in sorted(labels):  # sort so we always get the same colors for a given set of labels
            colors[label] = f"hsl({h},{s}%,{l}%)"
            h += h_step
        return colors

    @staticmethod
    def consecutive_chars_to_html(
        consecutive_chars_list: List[CharState],
        text: str,
        encoding: Encoding,
    ):
        """
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same token and annotation; each
        CharState exposes a "partition_key" method that makes it easy to compare if two
        chars are consecutive.

        Only the first and last entries of the list are inspected. Text and token strings
        are HTML-escaped before being placed in the markup.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A non-empty list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        """
        first = consecutive_chars_list[0]
        if first.char_ix is None:
            # its a special token
            # special tokens are represented as empty spans. We use the data attribute and css
            # magic to display it. The value is quoted and escaped so tokens such as "<s>"
            # cannot terminate the tag.
            stoken = html.escape(encoding.tokens[first.token_ix], quote=True)
            return f'<span class="special-token" data-stoken="{stoken}"></span>'

        # We're not in a special token so this group has a start and end.
        last = consecutive_chars_list[-1]
        span_text = html.escape(text[first.char_ix : last.char_ix + 1], quote=False)

        css_classes = []  # What css classes will we apply on the resulting span
        data_items = {}  # What data attributes will we apply on the result span
        if first.token_ix is not None:
            # We can either be in a token or not (e.g. in white space)
            css_classes.append("token")
            if first.is_multitoken:
                css_classes.append("multi-token")
            # We use the parity to color alternating tokens.
            # A token might be split by an annotation that ends in the middle of it, so this
            # lets us visually indicate a consecutive token despite its possible splitting in
            # the html markup
            css_classes.append("odd-token" if first.token_ix % 2 else "even-token")
            token = encoding.tokens[first.token_ix]
            if EncodingVisualizer.unk_token_regex.search(token) is not None:
                # This is a special token that is in the text. probably UNK
                css_classes.append("special-token")
                # TODO is this the right name for the data attribute ?
                data_items["stok"] = token
        else:
            # In this case we are looking at a group/single char that is not tokenized.
            # e.g. white space
            css_classes.append("non-token")

        class_attr = 'class="' + " ".join(css_classes) + '"'
        data = "".join(f' data-{key}="{html.escape(val, quote=True)}"' for key, val in data_items.items())
        return f"<span {class_attr} {data} >{span_text}</span>"

    @staticmethod
    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
        """
        Renders the text as a sequence of token spans nested inside annotation spans

        Args:
            text (:obj:`str`):
                The raw text being visualized

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            :obj:`str`: The full HTML document produced by ``HTMLBody``
        """
        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
        if not char_states:
            # Empty text: there is nothing to group, emit an empty body.
            return HTMLBody([])

        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
        spans = []

        def open_annotation(anno_ix: int) -> str:
            # Opening tag of the span wrapping everything covered by annotation `anno_ix`
            label = annotations[anno_ix].label
            color = label_colors_dict[label]
            safe_label = html.escape(str(label), quote=True)
            return f'<span class="annotation" style="color:{color}" data-label="{safe_label}">'

        def flush(group: List[CharState]) -> None:
            # Emit one span for a finished group of consecutive characters
            spans.append(
                EncodingVisualizer.consecutive_chars_to_html(
                    group,
                    text=text,
                    encoding=encoding,
                )
            )

        current_consecutive_chars = [char_states[0]]
        prev_anno_ix = char_states[0].anno_ix
        if prev_anno_ix is not None:
            # If we started in an annotation make a span for it
            spans.append(open_annotation(prev_anno_ix))

        for cs in char_states[1:]:
            cur_anno_ix = cs.anno_ix
            if cur_anno_ix != prev_anno_ix:
                # We've transitioned in or out of an annotation: finish the current group,
                # close the annotation we left (if any) and open the one we entered (if any)
                flush(current_consecutive_chars)
                if prev_anno_ix is not None:
                    spans.append("</span>")
                if cur_anno_ix is not None:
                    spans.append(open_annotation(cur_anno_ix))
                prev_anno_ix = cur_anno_ix
                current_consecutive_chars = [cs]
            elif cs.partition_key() == current_consecutive_chars[0].partition_key():
                # The current character is in the same "group" as the previous one
                current_consecutive_chars.append(cs)
            else:
                # Same annotation but a different token: start a new group
                flush(current_consecutive_chars)
                current_consecutive_chars = [cs]

        # Fill out the final span, and close the annotation if the text ended inside one
        flush(current_consecutive_chars)
        if prev_anno_ix is not None:
            spans.append("</span>")

        return HTMLBody(spans)  # Send the list of spans to the body of our html

    @staticmethod
    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
        """
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of length len(text) whose entry at index i is None if there is no annotation on
            character i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations. When annotations overlap, the one that comes
            later in the list wins.
        """
        text_len = len(text)
        annotation_map: PartialIntList = [None] * text_len
        for anno_ix, a in enumerate(annotations):
            # Clamp to the text so a negative start cannot wrap around to the end of the
            # list and an end past the text cannot raise an IndexError.
            for i in range(max(a.start, 0), min(a.end, text_len)):
                annotation_map[i] = anno_ix
        return annotation_map

    @staticmethod
    def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
        """
        For each character in the original text, we emit a CharState representing it's "state":

            * which token_ix(es) it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        """
        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
        # Todo make this a dataclass or named tuple
        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
        for token_ix, _ in enumerate(encoding.tokens):
            offsets = encoding.token_to_chars(token_ix)
            if offsets is None:
                # No character span is reported for this token, nothing to record
                continue
            start, end = offsets
            for i in range(start, end):
                char_states[i].tokens.append(token_ix)
        for state, anno_ix in zip(char_states, annotation_map):
            state.anno_ix = anno_ix
        return char_states
def HTMLBody(children: List[str], css_styles=css) -> str:
    """
    Wraps a list of html fragments in a complete html document with embedded css

    Args:
        children (:obj:`List[str]`):
            Strings, assumed to be html elements, concatenated in order with no separator

        css_styles (:obj:`str`, `optional`):
            The stylesheet text placed in the ``<style>`` element. Defaults to the module
            level ``css`` value

    Returns:
        :obj:`str`: An HTML string with style markup
    """
    children_text = "".join(children)
    document = f"""
<html>
<head>
<style>
{css_styles}
</style>
</head>
<body>
<div class="tokenized-text" dir=auto>
{children_text}
</div>
</body>
</html>
"""
    return document
|