rfc822.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
  27. import datetime
  28. timezone_names = {
  29. 'ut': 0, 'gmt': 0, 'z': 0,
  30. 'adt': -3, 'ast': -4, 'at': -4,
  31. 'edt': -4, 'est': -5, 'et': -5,
  32. 'cdt': -5, 'cst': -6, 'ct': -6,
  33. 'mdt': -6, 'mst': -7, 'mt': -7,
  34. 'pdt': -7, 'pst': -8, 'pt': -8,
  35. 'a': -1, 'n': 1,
  36. 'm': -12, 'y': 12,
  37. 'met': 1, 'mest': 2,
  38. }
  39. day_names = {'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'}
  40. months = {
  41. 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
  42. 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
  43. }
  44. def _parse_date_rfc822(date):
  45. """Parse RFC 822 dates and times
  46. http://tools.ietf.org/html/rfc822#section-5
  47. There are some formatting differences that are accounted for:
  48. 1. Years may be two or four digits.
  49. 2. The month and day can be swapped.
  50. 3. Additional timezone names are supported.
  51. 4. A default time and timezone are assumed if only a date is present.
  52. :param str date: a date/time string that will be converted to a time tuple
  53. :returns: a UTC time tuple, or None
  54. :rtype: time.struct_time | None
  55. """
  56. parts = date.lower().split()
  57. if len(parts) < 5:
  58. # Assume that the time and timezone are missing
  59. parts.extend(('00:00:00', '0000'))
  60. # Remove the day name
  61. if parts[0][:3] in day_names:
  62. parts = parts[1:]
  63. if len(parts) < 5:
  64. # If there are still fewer than five parts, there's not enough
  65. # information to interpret this.
  66. return None
  67. # Handle the day and month name.
  68. month = months.get(parts[1][:3])
  69. try:
  70. day = int(parts[0])
  71. except ValueError:
  72. # Check if the day and month are swapped.
  73. if months.get(parts[0][:3]):
  74. try:
  75. day = int(parts[1])
  76. except ValueError:
  77. return None
  78. month = months.get(parts[0][:3])
  79. else:
  80. return None
  81. if not month:
  82. return None
  83. # Handle the year.
  84. try:
  85. year = int(parts[2])
  86. except ValueError:
  87. return None
  88. # Normalize two-digit years:
  89. # Anything in the 90's is interpreted as 1990 and on.
  90. # Anything 89 or less is interpreted as 2089 or before.
  91. if len(parts[2]) <= 2:
  92. year += (1900, 2000)[year < 90]
  93. # Handle the time (default to 00:00:00).
  94. time_parts = parts[3].split(':')
  95. time_parts.extend(('0',) * (3 - len(time_parts)))
  96. try:
  97. (hour, minute, second) = [int(i) for i in time_parts]
  98. except ValueError:
  99. return None
  100. # Handle the timezone information, if any (default to +0000).
  101. # Strip 'Etc/' from the timezone.
  102. if parts[4].startswith('etc/'):
  103. parts[4] = parts[4][4:]
  104. # Normalize timezones that start with 'gmt':
  105. # GMT-05:00 => -0500
  106. # GMT => GMT
  107. if parts[4].startswith('gmt'):
  108. parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
  109. # Handle timezones like '-0500', '+0500', and 'EST'
  110. if parts[4] and parts[4][0] in ('-', '+'):
  111. try:
  112. if ':' in parts[4]:
  113. timezone_hours = int(parts[4][1:3])
  114. timezone_minutes = int(parts[4][4:])
  115. else:
  116. timezone_hours = int(parts[4][1:3])
  117. timezone_minutes = int(parts[4][3:])
  118. except ValueError:
  119. return None
  120. if parts[4].startswith('-'):
  121. timezone_hours *= -1
  122. timezone_minutes *= -1
  123. else:
  124. timezone_hours = timezone_names.get(parts[4], 0)
  125. timezone_minutes = 0
  126. # Create the datetime object and timezone delta objects
  127. try:
  128. stamp = datetime.datetime(year, month, day, hour, minute, second)
  129. except ValueError:
  130. return None
  131. delta = datetime.timedelta(0, 0, 0, 0, timezone_minutes, timezone_hours)
  132. # Return the date and timestamp in a UTC 9-tuple
  133. try:
  134. return (stamp - delta).utctimetuple()
  135. except (OverflowError, ValueError):
  136. # IronPython throws ValueErrors instead of OverflowErrors
  137. return None