_arff.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107
  1. # =============================================================================
  2. # Federal University of Rio Grande do Sul (UFRGS)
  3. # Connectionist Artificial Intelligence Laboratory (LIAC)
  4. # Renato de Pontes Pereira - rppereira@inf.ufrgs.br
  5. # =============================================================================
  6. # Copyright (c) 2011 Renato de Pontes Pereira, renato.ppontes at gmail dot com
  7. #
  8. # Permission is hereby granted, free of charge, to any person obtaining a copy
  9. # of this software and associated documentation files (the "Software"), to deal
  10. # in the Software without restriction, including without limitation the rights
  11. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. # copies of the Software, and to permit persons to whom the Software is
  13. # furnished to do so, subject to the following conditions:
  14. #
  15. # The above copyright notice and this permission notice shall be included in
  16. # all copies or substantial portions of the Software.
  17. #
  18. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24. # SOFTWARE.
  25. # =============================================================================
  26. '''
  27. The liac-arff module implements functions to read and write ARFF files in
  28. Python. It was created in the Connectionist Artificial Intelligence Laboratory
  29. (LIAC), which takes place at the Federal University of Rio Grande do Sul
  30. (UFRGS), in Brazil.
  31. ARFF (Attribute-Relation File Format) is a file format specially created to
  32. describe datasets which are commonly used for machine learning experiments and
  33. software. This file format was created to be used in Weka, the best
  34. representative software for machine learning automated experiments.
  35. An ARFF file can be divided into two sections: header and data. The Header
  36. describes the metadata of the dataset, including a general description of the
  37. dataset, its name and its attributes. The source below is an example of a
  38. header section in a XOR dataset::
  39. %
  40. % XOR Dataset
  41. %
  42. % Created by Renato Pereira
  43. % rppereira@inf.ufrgs.br
  44. % http://inf.ufrgs.br/~rppereira
  45. %
  46. %
  47. @RELATION XOR
  48. @ATTRIBUTE input1 REAL
  49. @ATTRIBUTE input2 REAL
  50. @ATTRIBUTE y REAL
  51. The Data section of an ARFF file describes the observations of the dataset, in
  52. the case of XOR dataset::
  53. @DATA
  54. 0.0,0.0,0.0
  55. 0.0,1.0,1.0
  56. 1.0,0.0,1.0
  57. 1.0,1.0,0.0
  58. %
  59. %
  60. %
  61. Notice that several lines are starting with an ``%`` symbol, denoting a
  62. comment, thus, lines with ``%`` at the beginning will be ignored, except by the
  63. description part at the beginning of the file. The declarations ``@RELATION``,
  64. ``@ATTRIBUTE``, and ``@DATA`` are all case insensitive and obligatory.
  65. For more information and details about the ARFF file description, consult
  66. http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  67. ARFF Files in Python
  68. ~~~~~~~~~~~~~~~~~~~~
  69. This module uses built-in Python objects to represent a deserialized ARFF
  70. file. A dictionary is used as the container of the data and metadata of ARFF,
  71. and has the following keys:
  72. - **description**: (OPTIONAL) a string with the description of the dataset.
  73. - **relation**: (OBLIGATORY) a string with the name of the dataset.
  74. - **attributes**: (OBLIGATORY) a list of attributes with the following
  75. template::
  76. (attribute_name, attribute_type)
  77. the attribute_name is a string, and attribute_type must be a string
  78. or a list of strings.
  79. - **data**: (OBLIGATORY) a list of data instances. Each data instance must be
  80. a list with values, depending on the attributes.
  81. The above keys must follow the case which were described, i.e., the keys are
  82. case sensitive. The attribute type ``attribute_type`` must be one of these
  83. strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or
  84. ``STRING``. For nominal attributes, the ``attribute_type`` must be a list of
  85. strings.
  86. In this format, the XOR dataset presented above can be represented as a python
  87. object as::
  88. xor_dataset = {
  89. 'description': 'XOR Dataset',
  90. 'relation': 'XOR',
  91. 'attributes': [
  92. ('input1', 'REAL'),
  93. ('input2', 'REAL'),
  94. ('y', 'REAL'),
  95. ],
  96. 'data': [
  97. [0.0, 0.0, 0.0],
  98. [0.0, 1.0, 1.0],
  99. [1.0, 0.0, 1.0],
  100. [1.0, 1.0, 0.0]
  101. ]
  102. }
  103. Features
  104. ~~~~~~~~
  105. This module provides several features, including:
  106. - Read and write ARFF files using python built-in structures, such as dictionaries
  107. and lists;
  108. - Supports `scipy.sparse.coo <http://docs.scipy
  109. .org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html#scipy.sparse.coo_matrix>`_
  110. and lists of dictionaries as used by SVMLight
  111. - Supports the following attribute types: NUMERIC, REAL, INTEGER, STRING, and
  112. NOMINAL;
  113. - Has an interface similar to other built-in modules such as ``json``, or
  114. ``zipfile``;
  115. - Supports read and write the descriptions of files;
  116. - Supports missing values and names with spaces;
  117. - Supports unicode values and names;
  118. - Fully compatible with Python 2.7+, Python 3.5+, pypy and pypy3;
  119. - Under `MIT License <http://opensource.org/licenses/MIT>`_
  120. '''
  121. __author__ = 'Renato de Pontes Pereira, Matthias Feurer, Joel Nothman'
  122. __author_email__ = ('renato.ppontes@gmail.com, '
  123. 'feurerm@informatik.uni-freiburg.de, '
  124. 'joel.nothman@gmail.com')
  125. __version__ = '2.4.0'
  126. import re
  127. import csv
  128. from typing import TYPE_CHECKING
  129. from typing import Optional, List, Dict, Any, Iterator, Union, Tuple
  130. # CONSTANTS ===================================================================
  131. _SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING']
  132. _TK_DESCRIPTION = '%'
  133. _TK_COMMENT = '%'
  134. _TK_RELATION = '@RELATION'
  135. _TK_ATTRIBUTE = '@ATTRIBUTE'
  136. _TK_DATA = '@DATA'
  137. _RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE)
  138. _RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE)
  139. _RE_QUOTE_CHARS = re.compile(r'["\'\\\s%,\000-\031]', re.UNICODE)
  140. _RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])|[\n\r\t\000-\031]')
  141. _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE)
  142. _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE)
  143. ArffDenseDataType = Iterator[List]
  144. ArffSparseDataType = Tuple[List, ...]
  145. if TYPE_CHECKING:
  146. # typing_extensions is available when mypy is installed
  147. from typing_extensions import TypedDict
  148. class ArffContainerType(TypedDict):
  149. description: str
  150. relation: str
  151. attributes: List
  152. data: Union[ArffDenseDataType, ArffSparseDataType]
  153. else:
  154. ArffContainerType = Dict[str, Any]
  155. def _build_re_values():
  156. quoted_re = r'''
  157. " # open quote followed by zero or more of:
  158. (?:
  159. (?<!\\) # no additional backslash
  160. (?:\\\\)* # maybe escaped backslashes
  161. \\" # escaped quote
  162. |
  163. \\[^"] # escaping a non-quote
  164. |
  165. [^"\\] # non-quote char
  166. )*
  167. " # close quote
  168. '''
  169. # a value is surrounded by " or by ' or contains no quotables
  170. value_re = r'''(?:
  171. %s| # a value may be surrounded by "
  172. %s| # or by '
  173. [^,\s"'{}]+ # or may contain no characters requiring quoting
  174. )''' % (quoted_re,
  175. quoted_re.replace('"', "'"))
  176. # This captures (value, error) groups. Because empty values are allowed,
  177. # we cannot just look for empty values to handle syntax errors.
  178. # We presume the line has had ',' prepended...
  179. dense = re.compile(r'''(?x)
  180. , # may follow ','
  181. \s*
  182. ((?=,)|$|{value_re}) # empty or value
  183. |
  184. (\S.*) # error
  185. '''.format(value_re=value_re))
  186. # This captures (key, value) groups and will have an empty key/value
  187. # in case of syntax errors.
  188. # It does not ensure that the line starts with '{' or ends with '}'.
  189. sparse = re.compile(r'''(?x)
  190. (?:^\s*\{|,) # may follow ',', or '{' at line start
  191. \s*
  192. (\d+) # attribute key
  193. \s+
  194. (%(value_re)s) # value
  195. |
  196. (?!}\s*$) # not an error if it's }$
  197. (?!^\s*{\s*}\s*$) # not an error if it's ^{}$
  198. \S.* # error
  199. ''' % {'value_re': value_re})
  200. return dense, sparse
  201. _RE_DENSE_VALUES, _RE_SPARSE_KEY_VALUES = _build_re_values()
  202. _ESCAPE_SUB_MAP = {
  203. '\\\\': '\\',
  204. '\\"': '"',
  205. "\\'": "'",
  206. '\\t': '\t',
  207. '\\n': '\n',
  208. '\\r': '\r',
  209. '\\b': '\b',
  210. '\\f': '\f',
  211. '\\%': '%',
  212. }
  213. _UNESCAPE_SUB_MAP = {chr(i): '\\%03o' % i for i in range(32)}
  214. _UNESCAPE_SUB_MAP.update({v: k for k, v in _ESCAPE_SUB_MAP.items()})
  215. _UNESCAPE_SUB_MAP[''] = '\\'
  216. _ESCAPE_SUB_MAP.update({'\\%d' % i: chr(i) for i in range(10)})
  217. def _escape_sub_callback(match):
  218. s = match.group()
  219. if len(s) == 2:
  220. try:
  221. return _ESCAPE_SUB_MAP[s]
  222. except KeyError:
  223. raise ValueError('Unsupported escape sequence: %s' % s)
  224. if s[1] == 'u':
  225. return chr(int(s[2:], 16))
  226. else:
  227. return chr(int(s[1:], 8))
  228. def _unquote(v):
  229. if v[:1] in ('"', "'"):
  230. return re.sub(r'\\([0-9]{1,3}|u[0-9a-f]{4}|.)', _escape_sub_callback,
  231. v[1:-1])
  232. elif v in ('?', ''):
  233. return None
  234. else:
  235. return v
  236. def _parse_values(s):
  237. '''(INTERNAL) Split a line into a list of values'''
  238. if not _RE_NONTRIVIAL_DATA.search(s):
  239. # Fast path for trivial cases (unfortunately we have to handle missing
  240. # values because of the empty string case :(.)
  241. return [None if s in ('?', '') else s
  242. for s in next(csv.reader([s]))]
  243. # _RE_DENSE_VALUES tokenizes despite quoting, whitespace, etc.
  244. values, errors = zip(*_RE_DENSE_VALUES.findall(',' + s))
  245. if not any(errors):
  246. return [_unquote(v) for v in values]
  247. if _RE_SPARSE_LINE.match(s):
  248. try:
  249. return {int(k): _unquote(v)
  250. for k, v in _RE_SPARSE_KEY_VALUES.findall(s)}
  251. except ValueError:
  252. # an ARFF syntax error in sparse data
  253. for match in _RE_SPARSE_KEY_VALUES.finditer(s):
  254. if not match.group(1):
  255. raise BadLayout('Error parsing %r' % match.group())
  256. raise BadLayout('Unknown parsing error')
  257. else:
  258. # an ARFF syntax error
  259. for match in _RE_DENSE_VALUES.finditer(s):
  260. if match.group(2):
  261. raise BadLayout('Error parsing %r' % match.group())
  262. raise BadLayout('Unknown parsing error')
  263. DENSE = 0 # Constant value representing a dense matrix
  264. COO = 1 # Constant value representing a sparse matrix in coordinate format
  265. LOD = 2 # Constant value representing a sparse matrix in list of
  266. # dictionaries format
  267. DENSE_GEN = 3 # Generator of dictionaries
  268. LOD_GEN = 4 # Generator of dictionaries
  269. _SUPPORTED_DATA_STRUCTURES = [DENSE, COO, LOD, DENSE_GEN, LOD_GEN]
  270. # EXCEPTIONS ==================================================================
  271. class ArffException(Exception):
  272. message: Optional[str] = None
  273. def __init__(self):
  274. self.line = -1
  275. def __str__(self):
  276. return self.message%self.line
  277. class BadRelationFormat(ArffException):
  278. '''Error raised when the relation declaration is in an invalid format.'''
  279. message = 'Bad @RELATION format, at line %d.'
  280. class BadAttributeFormat(ArffException):
  281. '''Error raised when some attribute declaration is in an invalid format.'''
  282. message = 'Bad @ATTRIBUTE format, at line %d.'
  283. class BadDataFormat(ArffException):
  284. '''Error raised when some data instance is in an invalid format.'''
  285. def __init__(self, value):
  286. super().__init__()
  287. self.message = (
  288. 'Bad @DATA instance format in line %d: ' +
  289. ('%s' % value)
  290. )
  291. class BadAttributeType(ArffException):
  292. '''Error raised when some invalid type is provided into the attribute
  293. declaration.'''
  294. message = 'Bad @ATTRIBUTE type, at line %d.'
  295. class BadAttributeName(ArffException):
  296. '''Error raised when an attribute name is provided twice the attribute
  297. declaration.'''
  298. def __init__(self, value, value2):
  299. super().__init__()
  300. self.message = (
  301. ('Bad @ATTRIBUTE name %s at line' % value) +
  302. ' %d, this name is already in use in line' +
  303. (' %d.' % value2)
  304. )
  305. class BadNominalValue(ArffException):
  306. '''Error raised when a value in used in some data instance but is not
  307. declared into it respective attribute declaration.'''
  308. def __init__(self, value):
  309. super().__init__()
  310. self.message = (
  311. ('Data value %s not found in nominal declaration, ' % value)
  312. + 'at line %d.'
  313. )
  314. class BadNominalFormatting(ArffException):
  315. '''Error raised when a nominal value with space is not properly quoted.'''
  316. def __init__(self, value):
  317. super().__init__()
  318. self.message = (
  319. ('Nominal data value "%s" not properly quoted in line ' % value) +
  320. '%d.'
  321. )
  322. class BadNumericalValue(ArffException):
  323. '''Error raised when and invalid numerical value is used in some data
  324. instance.'''
  325. message = 'Invalid numerical value, at line %d.'
  326. class BadStringValue(ArffException):
  327. '''Error raise when a string contains space but is not quoted.'''
  328. message = 'Invalid string value at line %d.'
  329. class BadLayout(ArffException):
  330. '''Error raised when the layout of the ARFF file has something wrong.'''
  331. message = 'Invalid layout of the ARFF file, at line %d.'
  332. def __init__(self, msg=''):
  333. super().__init__()
  334. if msg:
  335. self.message = BadLayout.message + ' ' + msg.replace('%', '%%')
  336. class BadObject(ArffException):
  337. '''Error raised when the object representing the ARFF file has something
  338. wrong.'''
  339. def __init__(self, msg='Invalid object.'):
  340. self.msg = msg
  341. def __str__(self):
  342. return '%s' % self.msg
  343. # =============================================================================
  344. # INTERNAL ====================================================================
  345. def _unescape_sub_callback(match):
  346. return _UNESCAPE_SUB_MAP[match.group()]
  347. def encode_string(s):
  348. if _RE_QUOTE_CHARS.search(s):
  349. return "'%s'" % _RE_ESCAPE_CHARS.sub(_unescape_sub_callback, s)
  350. return s
  351. class EncodedNominalConversor:
  352. def __init__(self, values):
  353. self.values = {v: i for i, v in enumerate(values)}
  354. self.values[0] = 0
  355. def __call__(self, value):
  356. try:
  357. return self.values[value]
  358. except KeyError:
  359. raise BadNominalValue(value)
  360. class NominalConversor:
  361. def __init__(self, values):
  362. self.values = set(values)
  363. self.zero_value = values[0]
  364. def __call__(self, value):
  365. if value not in self.values:
  366. if value == 0:
  367. # Sparse decode
  368. # See issue #52: nominals should take their first value when
  369. # unspecified in a sparse matrix. Naturally, this is consistent
  370. # with EncodedNominalConversor.
  371. return self.zero_value
  372. raise BadNominalValue(value)
  373. return str(value)
  374. class DenseGeneratorData:
  375. '''Internal helper class to allow for different matrix types without
  376. making the code a huge collection of if statements.'''
  377. def decode_rows(self, stream, conversors):
  378. for row in stream:
  379. values = _parse_values(row)
  380. if isinstance(values, dict):
  381. if values and max(values) >= len(conversors):
  382. raise BadDataFormat(row)
  383. # XXX: int 0 is used for implicit values, not '0'
  384. values = [values[i] if i in values else 0 for i in
  385. range(len(conversors))]
  386. else:
  387. if len(values) != len(conversors):
  388. raise BadDataFormat(row)
  389. yield self._decode_values(values, conversors)
  390. @staticmethod
  391. def _decode_values(values, conversors):
  392. try:
  393. values = [None if value is None else conversor(value)
  394. for conversor, value
  395. in zip(conversors, values)]
  396. except ValueError as exc:
  397. if 'float: ' in str(exc):
  398. raise BadNumericalValue()
  399. return values
  400. def encode_data(self, data, attributes):
  401. '''(INTERNAL) Encodes a line of data.
  402. Data instances follow the csv format, i.e, attribute values are
  403. delimited by commas. After converted from csv.
  404. :param data: a list of values.
  405. :param attributes: a list of attributes. Used to check if data is valid.
  406. :return: a string with the encoded data line.
  407. '''
  408. current_row = 0
  409. for inst in data:
  410. if len(inst) != len(attributes):
  411. raise BadObject(
  412. 'Instance %d has %d attributes, expected %d' %
  413. (current_row, len(inst), len(attributes))
  414. )
  415. new_data = []
  416. for value in inst:
  417. if value is None or value == '' or value != value:
  418. s = '?'
  419. else:
  420. s = encode_string(str(value))
  421. new_data.append(s)
  422. current_row += 1
  423. yield ','.join(new_data)
  424. class _DataListMixin:
  425. """Mixin to return a list from decode_rows instead of a generator"""
  426. def decode_rows(self, stream, conversors):
  427. return list(super().decode_rows(stream, conversors))
  428. class Data(_DataListMixin, DenseGeneratorData):
  429. pass
  430. class COOData:
  431. def decode_rows(self, stream, conversors):
  432. data, rows, cols = [], [], []
  433. for i, row in enumerate(stream):
  434. values = _parse_values(row)
  435. if not isinstance(values, dict):
  436. raise BadLayout()
  437. if not values:
  438. continue
  439. row_cols, values = zip(*sorted(values.items()))
  440. try:
  441. values = [value if value is None else conversors[key](value)
  442. for key, value in zip(row_cols, values)]
  443. except ValueError as exc:
  444. if 'float: ' in str(exc):
  445. raise BadNumericalValue()
  446. raise
  447. except IndexError:
  448. # conversor out of range
  449. raise BadDataFormat(row)
  450. data.extend(values)
  451. rows.extend([i] * len(values))
  452. cols.extend(row_cols)
  453. return data, rows, cols
  454. def encode_data(self, data, attributes):
  455. num_attributes = len(attributes)
  456. new_data = []
  457. current_row = 0
  458. row = data.row
  459. col = data.col
  460. data = data.data
  461. # Check if the rows are sorted
  462. if not all(row[i] <= row[i + 1] for i in range(len(row) - 1)):
  463. raise ValueError("liac-arff can only output COO matrices with "
  464. "sorted rows.")
  465. for v, col, row in zip(data, col, row):
  466. if row > current_row:
  467. # Add empty rows if necessary
  468. while current_row < row:
  469. yield " ".join(["{", ','.join(new_data), "}"])
  470. new_data = []
  471. current_row += 1
  472. if col >= num_attributes:
  473. raise BadObject(
  474. 'Instance %d has at least %d attributes, expected %d' %
  475. (current_row, col + 1, num_attributes)
  476. )
  477. if v is None or v == '' or v != v:
  478. s = '?'
  479. else:
  480. s = encode_string(str(v))
  481. new_data.append("%d %s" % (col, s))
  482. yield " ".join(["{", ','.join(new_data), "}"])
  483. class LODGeneratorData:
  484. def decode_rows(self, stream, conversors):
  485. for row in stream:
  486. values = _parse_values(row)
  487. if not isinstance(values, dict):
  488. raise BadLayout()
  489. try:
  490. yield {key: None if value is None else conversors[key](value)
  491. for key, value in values.items()}
  492. except ValueError as exc:
  493. if 'float: ' in str(exc):
  494. raise BadNumericalValue()
  495. raise
  496. except IndexError:
  497. # conversor out of range
  498. raise BadDataFormat(row)
  499. def encode_data(self, data, attributes):
  500. current_row = 0
  501. num_attributes = len(attributes)
  502. for row in data:
  503. new_data = []
  504. if len(row) > 0 and max(row) >= num_attributes:
  505. raise BadObject(
  506. 'Instance %d has %d attributes, expected %d' %
  507. (current_row, max(row) + 1, num_attributes)
  508. )
  509. for col in sorted(row):
  510. v = row[col]
  511. if v is None or v == '' or v != v:
  512. s = '?'
  513. else:
  514. s = encode_string(str(v))
  515. new_data.append("%d %s" % (col, s))
  516. current_row += 1
  517. yield " ".join(["{", ','.join(new_data), "}"])
  518. class LODData(_DataListMixin, LODGeneratorData):
  519. pass
  520. def _get_data_object_for_decoding(matrix_type):
  521. if matrix_type == DENSE:
  522. return Data()
  523. elif matrix_type == COO:
  524. return COOData()
  525. elif matrix_type == LOD:
  526. return LODData()
  527. elif matrix_type == DENSE_GEN:
  528. return DenseGeneratorData()
  529. elif matrix_type == LOD_GEN:
  530. return LODGeneratorData()
  531. else:
  532. raise ValueError("Matrix type %s not supported." % str(matrix_type))
  533. def _get_data_object_for_encoding(matrix):
  534. # Probably a scipy.sparse
  535. if hasattr(matrix, 'format'):
  536. if matrix.format == 'coo':
  537. return COOData()
  538. else:
  539. raise ValueError('Cannot guess matrix format!')
  540. elif isinstance(matrix[0], dict):
  541. return LODData()
  542. else:
  543. return Data()
  544. # =============================================================================
  545. # ADVANCED INTERFACE ==========================================================
  546. class ArffDecoder:
  547. '''An ARFF decoder.'''
  548. def __init__(self):
  549. '''Constructor.'''
  550. self._conversors = []
  551. self._current_line = 0
  552. def _decode_comment(self, s):
  553. '''(INTERNAL) Decodes a comment line.
  554. Comments are single line strings starting, obligatorily, with the ``%``
  555. character, and can have any symbol, including whitespaces or special
  556. characters.
  557. This method must receive a normalized string, i.e., a string without
  558. padding, including the "\r\n" characters.
  559. :param s: a normalized string.
  560. :return: a string with the decoded comment.
  561. '''
  562. res = re.sub(r'^\%( )?', '', s)
  563. return res
  564. def _decode_relation(self, s):
  565. '''(INTERNAL) Decodes a relation line.
  566. The relation declaration is a line with the format ``@RELATION
  567. <relation-name>``, where ``relation-name`` is a string. The string must
  568. start with alphabetic character and must be quoted if the name includes
  569. spaces, otherwise this method will raise a `BadRelationFormat` exception.
  570. This method must receive a normalized string, i.e., a string without
  571. padding, including the "\r\n" characters.
  572. :param s: a normalized string.
  573. :return: a string with the decoded relation name.
  574. '''
  575. _, v = s.split(' ', 1)
  576. v = v.strip()
  577. if not _RE_RELATION.match(v):
  578. raise BadRelationFormat()
  579. res = str(v.strip('"\''))
  580. return res
  581. def _decode_attribute(self, s):
  582. '''(INTERNAL) Decodes an attribute line.
  583. The attribute is the most complex declaration in an arff file. All
  584. attributes must follow the template::
  585. @attribute <attribute-name> <datatype>
  586. where ``attribute-name`` is a string, quoted if the name contains any
  587. whitespace, and ``datatype`` can be:
  588. - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.
  589. - Strings as ``STRING``.
  590. - Dates (NOT IMPLEMENTED).
  591. - Nominal attributes with format:
  592. {<nominal-name1>, <nominal-name2>, <nominal-name3>, ...}
  593. The nominal names follow the rules for the attribute names, i.e., they
  594. must be quoted if the name contains whitespaces.
  595. This method must receive a normalized string, i.e., a string without
  596. padding, including the "\r\n" characters.
  597. :param s: a normalized string.
  598. :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES).
  599. '''
  600. _, v = s.split(' ', 1)
  601. v = v.strip()
  602. # Verify the general structure of declaration
  603. m = _RE_ATTRIBUTE.match(v)
  604. if not m:
  605. raise BadAttributeFormat()
  606. # Extracts the raw name and type
  607. name, type_ = m.groups()
  608. # Extracts the final name
  609. name = str(name.strip('"\''))
  610. # Extracts the final type
  611. if type_[:1] == "{" and type_[-1:] == "}":
  612. try:
  613. type_ = _parse_values(type_.strip('{} '))
  614. except Exception:
  615. raise BadAttributeType()
  616. if isinstance(type_, dict):
  617. raise BadAttributeType()
  618. else:
  619. # If not nominal, verify the type name
  620. type_ = str(type_).upper()
  621. if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']:
  622. raise BadAttributeType()
  623. return (name, type_)
  624. def _decode(self, s, encode_nominal=False, matrix_type=DENSE):
  625. '''Do the job the ``encode``.'''
  626. # Make sure this method is idempotent
  627. self._current_line = 0
  628. # If string, convert to a list of lines
  629. if isinstance(s, str):
  630. s = s.strip('\r\n ').replace('\r\n', '\n').split('\n')
  631. # Create the return object
  632. obj: ArffContainerType = {
  633. 'description': '',
  634. 'relation': '',
  635. 'attributes': [],
  636. 'data': []
  637. }
  638. attribute_names = {}
  639. # Create the data helper object
  640. data = _get_data_object_for_decoding(matrix_type)
  641. # Read all lines
  642. STATE = _TK_DESCRIPTION
  643. s = iter(s)
  644. for row in s:
  645. self._current_line += 1
  646. # Ignore empty lines
  647. row = row.strip(' \r\n')
  648. if not row: continue
  649. u_row = row.upper()
  650. # DESCRIPTION -----------------------------------------------------
  651. if u_row.startswith(_TK_DESCRIPTION) and STATE == _TK_DESCRIPTION:
  652. obj['description'] += self._decode_comment(row) + '\n'
  653. # -----------------------------------------------------------------
  654. # RELATION --------------------------------------------------------
  655. elif u_row.startswith(_TK_RELATION):
  656. if STATE != _TK_DESCRIPTION:
  657. raise BadLayout()
  658. STATE = _TK_RELATION
  659. obj['relation'] = self._decode_relation(row)
  660. # -----------------------------------------------------------------
  661. # ATTRIBUTE -------------------------------------------------------
  662. elif u_row.startswith(_TK_ATTRIBUTE):
  663. if STATE != _TK_RELATION and STATE != _TK_ATTRIBUTE:
  664. raise BadLayout()
  665. STATE = _TK_ATTRIBUTE
  666. attr = self._decode_attribute(row)
  667. if attr[0] in attribute_names:
  668. raise BadAttributeName(attr[0], attribute_names[attr[0]])
  669. else:
  670. attribute_names[attr[0]] = self._current_line
  671. obj['attributes'].append(attr)
  672. if isinstance(attr[1], (list, tuple)):
  673. if encode_nominal:
  674. conversor = EncodedNominalConversor(attr[1])
  675. else:
  676. conversor = NominalConversor(attr[1])
  677. else:
  678. CONVERSOR_MAP = {'STRING': str,
  679. 'INTEGER': lambda x: int(float(x)),
  680. 'NUMERIC': float,
  681. 'REAL': float}
  682. conversor = CONVERSOR_MAP[attr[1]]
  683. self._conversors.append(conversor)
  684. # -----------------------------------------------------------------
  685. # DATA ------------------------------------------------------------
  686. elif u_row.startswith(_TK_DATA):
  687. if STATE != _TK_ATTRIBUTE:
  688. raise BadLayout()
  689. break
  690. # -----------------------------------------------------------------
  691. # COMMENT ---------------------------------------------------------
  692. elif u_row.startswith(_TK_COMMENT):
  693. pass
  694. # -----------------------------------------------------------------
  695. else:
  696. # Never found @DATA
  697. raise BadLayout()
  698. def stream():
  699. for row in s:
  700. self._current_line += 1
  701. row = row.strip()
  702. # Ignore empty lines and comment lines.
  703. if row and not row.startswith(_TK_COMMENT):
  704. yield row
  705. # Alter the data object
  706. obj['data'] = data.decode_rows(stream(), self._conversors)
  707. if obj['description'].endswith('\n'):
  708. obj['description'] = obj['description'][:-1]
  709. return obj
  710. def decode(self, s, encode_nominal=False, return_type=DENSE):
  711. '''Returns the Python representation of a given ARFF file.
  712. When a file object is passed as an argument, this method reads lines
  713. iteratively, avoiding to load unnecessary information to the memory.
  714. :param s: a string or file object with the ARFF file.
  715. :param encode_nominal: boolean, if True perform a label encoding
  716. while reading the .arff file.
  717. :param return_type: determines the data structure used to store the
  718. dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,
  719. `arff.DENSE_GEN` or `arff.LOD_GEN`.
  720. Consult the sections on `working with sparse data`_ and `loading
  721. progressively`_.
  722. '''
  723. try:
  724. return self._decode(s, encode_nominal=encode_nominal,
  725. matrix_type=return_type)
  726. except ArffException as e:
  727. e.line = self._current_line
  728. raise e
  729. class ArffEncoder:
  730. '''An ARFF encoder.'''
  731. def _encode_comment(self, s=''):
  732. '''(INTERNAL) Encodes a comment line.
  733. Comments are single line strings starting, obligatorily, with the ``%``
  734. character, and can have any symbol, including whitespaces or special
  735. characters.
  736. If ``s`` is None, this method will simply return an empty comment.
  737. :param s: (OPTIONAL) string.
  738. :return: a string with the encoded comment line.
  739. '''
  740. if s:
  741. return '%s %s'%(_TK_COMMENT, s)
  742. else:
  743. return '%s' % _TK_COMMENT
  744. def _encode_relation(self, name):
  745. '''(INTERNAL) Decodes a relation line.
  746. The relation declaration is a line with the format ``@RELATION
  747. <relation-name>``, where ``relation-name`` is a string.
  748. :param name: a string.
  749. :return: a string with the encoded relation declaration.
  750. '''
  751. for char in ' %{},':
  752. if char in name:
  753. name = '"%s"'%name
  754. break
  755. return '%s %s'%(_TK_RELATION, name)
  756. def _encode_attribute(self, name, type_):
  757. '''(INTERNAL) Encodes an attribute line.
  758. The attribute follow the template::
  759. @attribute <attribute-name> <datatype>
  760. where ``attribute-name`` is a string, and ``datatype`` can be:
  761. - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.
  762. - Strings as ``STRING``.
  763. - Dates (NOT IMPLEMENTED).
  764. - Nominal attributes with format:
  765. {<nominal-name1>, <nominal-name2>, <nominal-name3>, ...}
  766. This method must receive a the name of the attribute and its type, if
  767. the attribute type is nominal, ``type`` must be a list of values.
  768. :param name: a string.
  769. :param type_: a string or a list of string.
  770. :return: a string with the encoded attribute declaration.
  771. '''
  772. for char in ' %{},':
  773. if char in name:
  774. name = '"%s"'%name
  775. break
  776. if isinstance(type_, (tuple, list)):
  777. type_tmp = ['%s' % encode_string(type_k) for type_k in type_]
  778. type_ = '{%s}'%(', '.join(type_tmp))
  779. return '%s %s %s'%(_TK_ATTRIBUTE, name, type_)
  780. def encode(self, obj):
  781. '''Encodes a given object to an ARFF file.
  782. :param obj: the object containing the ARFF information.
  783. :return: the ARFF file as an string.
  784. '''
  785. data = [row for row in self.iter_encode(obj)]
  786. return '\n'.join(data)
  787. def iter_encode(self, obj):
  788. '''The iterative version of `arff.ArffEncoder.encode`.
  789. This encodes iteratively a given object and return, one-by-one, the
  790. lines of the ARFF file.
  791. :param obj: the object containing the ARFF information.
  792. :return: (yields) the ARFF file as strings.
  793. '''
  794. # DESCRIPTION
  795. if obj.get('description', None):
  796. for row in obj['description'].split('\n'):
  797. yield self._encode_comment(row)
  798. # RELATION
  799. if not obj.get('relation'):
  800. raise BadObject('Relation name not found or with invalid value.')
  801. yield self._encode_relation(obj['relation'])
  802. yield ''
  803. # ATTRIBUTES
  804. if not obj.get('attributes'):
  805. raise BadObject('Attributes not found.')
  806. attribute_names = set()
  807. for attr in obj['attributes']:
  808. # Verify for bad object format
  809. if not isinstance(attr, (tuple, list)) or \
  810. len(attr) != 2 or \
  811. not isinstance(attr[0], str):
  812. raise BadObject('Invalid attribute declaration "%s"'%str(attr))
  813. if isinstance(attr[1], str):
  814. # Verify for invalid types
  815. if attr[1] not in _SIMPLE_TYPES:
  816. raise BadObject('Invalid attribute type "%s"'%str(attr))
  817. # Verify for bad object format
  818. elif not isinstance(attr[1], (tuple, list)):
  819. raise BadObject('Invalid attribute type "%s"'%str(attr))
  820. # Verify attribute name is not used twice
  821. if attr[0] in attribute_names:
  822. raise BadObject('Trying to use attribute name "%s" for the '
  823. 'second time.' % str(attr[0]))
  824. else:
  825. attribute_names.add(attr[0])
  826. yield self._encode_attribute(attr[0], attr[1])
  827. yield ''
  828. attributes = obj['attributes']
  829. # DATA
  830. yield _TK_DATA
  831. if 'data' in obj:
  832. data = _get_data_object_for_encoding(obj.get('data'))
  833. yield from data.encode_data(obj.get('data'), attributes)
  834. yield ''
  835. # =============================================================================
  836. # BASIC INTERFACE =============================================================
  837. def load(fp, encode_nominal=False, return_type=DENSE):
  838. '''Load a file-like object containing the ARFF document and convert it into
  839. a Python object.
  840. :param fp: a file-like object.
  841. :param encode_nominal: boolean, if True perform a label encoding
  842. while reading the .arff file.
  843. :param return_type: determines the data structure used to store the
  844. dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,
  845. `arff.DENSE_GEN` or `arff.LOD_GEN`.
  846. Consult the sections on `working with sparse data`_ and `loading
  847. progressively`_.
  848. :return: a dictionary.
  849. '''
  850. decoder = ArffDecoder()
  851. return decoder.decode(fp, encode_nominal=encode_nominal,
  852. return_type=return_type)
  853. def loads(s, encode_nominal=False, return_type=DENSE):
  854. '''Convert a string instance containing the ARFF document into a Python
  855. object.
  856. :param s: a string object.
  857. :param encode_nominal: boolean, if True perform a label encoding
  858. while reading the .arff file.
  859. :param return_type: determines the data structure used to store the
  860. dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,
  861. `arff.DENSE_GEN` or `arff.LOD_GEN`.
  862. Consult the sections on `working with sparse data`_ and `loading
  863. progressively`_.
  864. :return: a dictionary.
  865. '''
  866. decoder = ArffDecoder()
  867. return decoder.decode(s, encode_nominal=encode_nominal,
  868. return_type=return_type)
  869. def dump(obj, fp):
  870. '''Serialize an object representing the ARFF document to a given file-like
  871. object.
  872. :param obj: a dictionary.
  873. :param fp: a file-like object.
  874. '''
  875. encoder = ArffEncoder()
  876. generator = encoder.iter_encode(obj)
  877. last_row = next(generator)
  878. for row in generator:
  879. fp.write(last_row + '\n')
  880. last_row = row
  881. fp.write(last_row)
  882. return fp
  883. def dumps(obj):
  884. '''Serialize an object representing the ARFF document, returning a string.
  885. :param obj: a dictionary.
  886. :return: a string with the ARFF document.
  887. '''
  888. encoder = ArffEncoder()
  889. return encoder.encode(obj)
  890. # =============================================================================