| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- # coding=utf-8
- # Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Number Normalizer class for SpeechT5."""
- import re
- class EnglishNumberNormalizer:
- def __init__(self):
- self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
- self.teens = [
- "",
- "eleven",
- "twelve",
- "thirteen",
- "fourteen",
- "fifteen",
- "sixteen",
- "seventeen",
- "eighteen",
- "nineteen",
- ]
- self.tens = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
- self.thousands = [
- "",
- "thousand",
- "million",
- "billion",
- "trillion",
- "quadrillion",
- "quintillion",
- "sextillion",
- "septillion",
- "octillion",
- "nonillion",
- "decillion",
- ]
- # Define a dictionary to map currency symbols to their names
- # Top most traded currencies according to
- # https://en.wikipedia.org/wiki/Template:Most_traded_currencies
- self.currency_symbols = {
- "$": " dollars",
- "€": " euros",
- "£": " pounds",
- "¢": " cents",
- "¥": " japanese yen",
- "﷼": " saudi riyal",
- "₹": " indian rupees",
- "₽": " russian rubles",
- "฿": " thai baht",
- "₺": " turkish liras",
- "₴": " ukrainian hryvnia",
- "₣": " swiss francs",
- "₡": " costa rican colon",
- "₱": " philippine peso",
- "₪": " israeli shekels",
- "₮": " mongolian tögrög",
- "₩": " south korean won",
- "₦": " nigerian naira",
- "₫": " vietnamese Đồng",
- }
- def spell_number(self, num):
- if num == 0:
- return "zero"
- parts = []
- for i in range(0, len(self.thousands)):
- if num % 1000 != 0:
- part = ""
- hundreds = num % 1000 // 100
- tens_units = num % 100
- if hundreds > 0:
- part += self.ones[hundreds] + " hundred"
- if tens_units > 0:
- part += " and "
- if tens_units > 10 and tens_units < 20:
- part += self.teens[tens_units - 10]
- else:
- tens_digit = self.tens[tens_units // 10]
- ones_digit = self.ones[tens_units % 10]
- if tens_digit:
- part += tens_digit
- if ones_digit:
- if tens_digit:
- part += " "
- part += ones_digit
- parts.append(part)
- num //= 1000
- return " ".join(reversed(parts))
- def convert(self, number):
- """
- Converts an individual number passed in string form to spelt-out form
- """
- if "." in number:
- integer_part, decimal_part = number.split(".")
- else:
- integer_part, decimal_part = number, "00"
- # Extract currency symbol if present
- currency_symbol = ""
- for symbol, name in self.currency_symbols.items():
- if integer_part.startswith(symbol):
- currency_symbol = name
- integer_part = integer_part[len(symbol) :]
- break
- if integer_part.startswith("-"):
- if integer_part[1:].startswith(symbol):
- currency_symbol = name
- integer_part = "-" + integer_part[len(symbol) + 1 :]
- break
- # Extract 'minus' prefix for negative numbers
- minus_prefix = ""
- if integer_part.startswith("-"):
- minus_prefix = "minus "
- integer_part = integer_part[1:]
- elif integer_part.startswith("minus"):
- minus_prefix = "minus "
- integer_part = integer_part[len("minus") :]
- percent_suffix = ""
- if "%" in integer_part or "%" in decimal_part:
- percent_suffix = " percent"
- integer_part = integer_part.replace("%", "")
- decimal_part = decimal_part.replace("%", "")
- integer_part = integer_part.zfill(3 * ((len(integer_part) - 1) // 3 + 1))
- parts = []
- for i in range(0, len(integer_part), 3):
- chunk = int(integer_part[i : i + 3])
- if chunk > 0:
- part = self.spell_number(chunk)
- unit = self.thousands[len(integer_part[i:]) // 3 - 1]
- if unit:
- part += " " + unit
- parts.append(part)
- spelled_integer = " ".join(parts)
- # Format the spelt-out number based on conditions, such as:
- # If it has decimal parts, currency symbol, minus prefix, etc
- if decimal_part == "00":
- return (
- f"{minus_prefix}{spelled_integer}{percent_suffix}{currency_symbol}"
- if minus_prefix or currency_symbol
- else f"{spelled_integer}{percent_suffix}"
- )
- else:
- spelled_decimal = " ".join([self.spell_number(int(digit)) for digit in decimal_part])
- return (
- f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}{currency_symbol}"
- if minus_prefix or currency_symbol
- else f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}"
- )
- def __call__(self, text):
- """
- Convert numbers / number-like quantities in a string to their spelt-out counterparts
- """
- # Form part of the pattern for all currency symbols
- pattern = r"(?<!\w)(-?\$?\€?\£?\¢?\¥?\₹?\₽?\฿?\₺?\₴?\₣?\₡?\₱?\₪?\₮?\₩?\₦?\₫?\﷼?\d+(?:\.\d{1,2})?%?)(?!\w)"
- # Find and replace commas in numbers (15,000 -> 15000, etc)
- text = re.sub(r"(\d+,\d+)", lambda match: match.group(1).replace(",", ""), text)
- # Use regex to find and replace numbers in the text
- converted_text = re.sub(pattern, lambda match: self.convert(match.group(1)), text)
- converted_text = re.sub(" +", " ", converted_text)
- return converted_text
|