# coding=utf-8
# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Number Normalizer class for SpeechT5."""
import re
  17. class EnglishNumberNormalizer:
  18. def __init__(self):
  19. self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
  20. self.teens = [
  21. "",
  22. "eleven",
  23. "twelve",
  24. "thirteen",
  25. "fourteen",
  26. "fifteen",
  27. "sixteen",
  28. "seventeen",
  29. "eighteen",
  30. "nineteen",
  31. ]
  32. self.tens = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
  33. self.thousands = [
  34. "",
  35. "thousand",
  36. "million",
  37. "billion",
  38. "trillion",
  39. "quadrillion",
  40. "quintillion",
  41. "sextillion",
  42. "septillion",
  43. "octillion",
  44. "nonillion",
  45. "decillion",
  46. ]
  47. # Define a dictionary to map currency symbols to their names
  48. # Top most traded currencies according to
  49. # https://en.wikipedia.org/wiki/Template:Most_traded_currencies
  50. self.currency_symbols = {
  51. "$": " dollars",
  52. "€": " euros",
  53. "£": " pounds",
  54. "¢": " cents",
  55. "¥": " japanese yen",
  56. "﷼": " saudi riyal",
  57. "₹": " indian rupees",
  58. "₽": " russian rubles",
  59. "฿": " thai baht",
  60. "₺": " turkish liras",
  61. "₴": " ukrainian hryvnia",
  62. "₣": " swiss francs",
  63. "₡": " costa rican colon",
  64. "₱": " philippine peso",
  65. "₪": " israeli shekels",
  66. "₮": " mongolian tögrög",
  67. "₩": " south korean won",
  68. "₦": " nigerian naira",
  69. "₫": " vietnamese Đồng",
  70. }
  71. def spell_number(self, num):
  72. if num == 0:
  73. return "zero"
  74. parts = []
  75. for i in range(0, len(self.thousands)):
  76. if num % 1000 != 0:
  77. part = ""
  78. hundreds = num % 1000 // 100
  79. tens_units = num % 100
  80. if hundreds > 0:
  81. part += self.ones[hundreds] + " hundred"
  82. if tens_units > 0:
  83. part += " and "
  84. if tens_units > 10 and tens_units < 20:
  85. part += self.teens[tens_units - 10]
  86. else:
  87. tens_digit = self.tens[tens_units // 10]
  88. ones_digit = self.ones[tens_units % 10]
  89. if tens_digit:
  90. part += tens_digit
  91. if ones_digit:
  92. if tens_digit:
  93. part += " "
  94. part += ones_digit
  95. parts.append(part)
  96. num //= 1000
  97. return " ".join(reversed(parts))
  98. def convert(self, number):
  99. """
  100. Converts an individual number passed in string form to spelt-out form
  101. """
  102. if "." in number:
  103. integer_part, decimal_part = number.split(".")
  104. else:
  105. integer_part, decimal_part = number, "00"
  106. # Extract currency symbol if present
  107. currency_symbol = ""
  108. for symbol, name in self.currency_symbols.items():
  109. if integer_part.startswith(symbol):
  110. currency_symbol = name
  111. integer_part = integer_part[len(symbol) :]
  112. break
  113. if integer_part.startswith("-"):
  114. if integer_part[1:].startswith(symbol):
  115. currency_symbol = name
  116. integer_part = "-" + integer_part[len(symbol) + 1 :]
  117. break
  118. # Extract 'minus' prefix for negative numbers
  119. minus_prefix = ""
  120. if integer_part.startswith("-"):
  121. minus_prefix = "minus "
  122. integer_part = integer_part[1:]
  123. elif integer_part.startswith("minus"):
  124. minus_prefix = "minus "
  125. integer_part = integer_part[len("minus") :]
  126. percent_suffix = ""
  127. if "%" in integer_part or "%" in decimal_part:
  128. percent_suffix = " percent"
  129. integer_part = integer_part.replace("%", "")
  130. decimal_part = decimal_part.replace("%", "")
  131. integer_part = integer_part.zfill(3 * ((len(integer_part) - 1) // 3 + 1))
  132. parts = []
  133. for i in range(0, len(integer_part), 3):
  134. chunk = int(integer_part[i : i + 3])
  135. if chunk > 0:
  136. part = self.spell_number(chunk)
  137. unit = self.thousands[len(integer_part[i:]) // 3 - 1]
  138. if unit:
  139. part += " " + unit
  140. parts.append(part)
  141. spelled_integer = " ".join(parts)
  142. # Format the spelt-out number based on conditions, such as:
  143. # If it has decimal parts, currency symbol, minus prefix, etc
  144. if decimal_part == "00":
  145. return (
  146. f"{minus_prefix}{spelled_integer}{percent_suffix}{currency_symbol}"
  147. if minus_prefix or currency_symbol
  148. else f"{spelled_integer}{percent_suffix}"
  149. )
  150. else:
  151. spelled_decimal = " ".join([self.spell_number(int(digit)) for digit in decimal_part])
  152. return (
  153. f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}{currency_symbol}"
  154. if minus_prefix or currency_symbol
  155. else f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}"
  156. )
  157. def __call__(self, text):
  158. """
  159. Convert numbers / number-like quantities in a string to their spelt-out counterparts
  160. """
  161. # Form part of the pattern for all currency symbols
  162. pattern = r"(?<!\w)(-?\$?\€?\£?\¢?\¥?\₹?\₽?\฿?\₺?\₴?\₣?\₡?\₱?\₪?\₮?\₩?\₦?\₫?\﷼?\d+(?:\.\d{1,2})?%?)(?!\w)"
  163. # Find and replace commas in numbers (15,000 -> 15000, etc)
  164. text = re.sub(r"(\d+,\d+)", lambda match: match.group(1).replace(",", ""), text)
  165. # Use regex to find and replace numbers in the text
  166. converted_text = re.sub(pattern, lambda match: self.convert(match.group(1)), text)
  167. converted_text = re.sub(" +", " ", converted_text)
  168. return converted_text