XMD
/
Lightstar


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
							# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""English Normalizer class for CLVP."""

import re


class EnglishNormalizer:
    def __init__(self):
        # List of (regular expression, replacement) pairs for abbreviations:
        self._abbreviations = [
            (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
            for x in [
                ("mrs", "misess"),
                ("mr", "mister"),
                ("dr", "doctor"),
                ("st", "saint"),
                ("co", "company"),
                ("jr", "junior"),
                ("maj", "major"),
                ("gen", "general"),
                ("drs", "doctors"),
                ("rev", "reverend"),
                ("lt", "lieutenant"),
                ("hon", "honorable"),
                ("sgt", "sergeant"),
                ("capt", "captain"),
                ("esq", "esquire"),
                ("ltd", "limited"),
                ("col", "colonel"),
                ("ft", "fort"),
            ]
        ]

        self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
        self.teens = [
            "ten",
            "eleven",
            "twelve",
            "thirteen",
            "fourteen",
            "fifteen",
            "sixteen",
            "seventeen",
            "eighteen",
            "nineteen",
        ]
        self.tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    def number_to_words(self, num: int) -> str:
        """
        Converts numbers(`int`) to words(`str`).

        Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
        trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
        thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
        """
        if num == 0:
            return "zero"
        elif num < 0:
            return "minus " + self.number_to_words(abs(num))
        elif num < 10:
            return self.ones[num]
        elif num < 20:
            return self.teens[num - 10]
        elif num < 100:
            return self.tens[num // 10] + ("-" + self.number_to_words(num % 10) if num % 10 != 0 else "")
        elif num < 1000:
            return (
                self.ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100) if num % 100 != 0 else "")
            )
        elif num < 1_000_000:
            return (
                self.number_to_words(num // 1000)
                + " thousand"
                + (", " + self.number_to_words(num % 1000) if num % 1000 != 0 else "")
            )
        elif num < 1_000_000_000:
            return (
                self.number_to_words(num // 1_000_000)
                + " million"
                + (", " + self.number_to_words(num % 1_000_000) if num % 1_000_000 != 0 else "")
            )
        elif num < 1_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000)
                + " billion"
                + (", " + self.number_to_words(num % 1_000_000_000) if num % 1_000_000_000 != 0 else "")
            )
        elif num < 1_000_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000_000)
                + " trillion"
                + (", " + self.number_to_words(num % 1_000_000_000_000) if num % 1_000_000_000_000 != 0 else "")
            )
        elif num < 1_000_000_000_000_000_000:
            return (
                self.number_to_words(num // 1_000_000_000_000_000)
                + " quadrillion"
                + (
                    ", " + self.number_to_words(num % 1_000_000_000_000_000)
                    if num % 1_000_000_000_000_000 != 0
                    else ""
                )
            )
        else:
            return "number out of range"

    def convert_to_ascii(self, text: str) -> str:
        """
        Converts unicode to ascii
        """
        return text.encode("ascii", "ignore").decode("utf-8")

    def _expand_dollars(self, m: str) -> str:
        """
        This method is used to expand numerical dollar values into spoken words.
        """
        match = m.group(1)
        parts = match.split(".")
        if len(parts) > 2:
            return match + " dollars"  # Unexpected format

        dollars = int(parts[0]) if parts[0] else 0
        cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
        if dollars and cents:
            dollar_unit = "dollar" if dollars == 1 else "dollars"
            cent_unit = "cent" if cents == 1 else "cents"
            return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
        elif dollars:
            dollar_unit = "dollar" if dollars == 1 else "dollars"
            return "%s %s" % (dollars, dollar_unit)
        elif cents:
            cent_unit = "cent" if cents == 1 else "cents"
            return "%s %s" % (cents, cent_unit)
        else:
            return "zero dollars"

    def _remove_commas(self, m: str) -> str:
        """
        This method is used to remove commas from sentences.
        """
        return m.group(1).replace(",", "")

    def _expand_decimal_point(self, m: str) -> str:
        """
        This method is used to expand '.' into spoken word ' point '.
        """
        return m.group(1).replace(".", " point ")

    def _expand_ordinal(self, num: str) -> str:
        """
        This method is used to expand ordinals such as '1st', '2nd' into spoken words.
        """
        ordinal_suffixes = {1: "st", 2: "nd", 3: "rd"}

        num = int(num.group(0)[:-2])
        if 10 <= num % 100 and num % 100 <= 20:
            suffix = "th"
        else:
            suffix = ordinal_suffixes.get(num % 10, "th")
        return self.number_to_words(num) + suffix

    def _expand_number(self, m: str) -> str:
        """
        This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
        link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
        """
        num = int(m.group(0))

        if num > 1000 and num < 3000:
            if num == 2000:
                return "two thousand"
            elif num > 2000 and num < 2010:
                return "two thousand " + self.number_to_words(num % 100)
            elif num % 100 == 0:
                return self.number_to_words(num // 100) + " hundred"
            else:
                return self.number_to_words(num)
        else:
            return self.number_to_words(num)

    def normalize_numbers(self, text: str) -> str:
        """
        This method is used to normalize numbers within a text such as converting the numbers to words, removing
        commas, etc.
        """
        text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
        text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
        text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
        text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
        text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
        text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
        return text

    def expand_abbreviations(self, text: str) -> str:
        """
        Expands the abbreviate words.
        """
        for regex, replacement in self._abbreviations:
            text = re.sub(regex, replacement, text)
        return text

    def collapse_whitespace(self, text: str) -> str:
        """
        Removes multiple whitespaces
        """
        return re.sub(re.compile(r"\s+"), " ", text)

    def __call__(self, text):
        """
        Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
        abbreviations
        """

        text = self.convert_to_ascii(text)
        text = text.lower()
        text = self.normalize_numbers(text)
        text = self.expand_abbreviations(text)
        text = self.collapse_whitespace(text)
        text = text.replace('"', "")

        return text