  1. """Implementation of ARFF parsers: via LIAC-ARFF and pandas."""
  2. import itertools
  3. import re
  4. from collections import OrderedDict
  5. from collections.abc import Generator
  6. from typing import List
  7. import numpy as np
  8. import scipy as sp
  9. from ..externals import _arff
  10. from ..externals._arff import ArffSparseDataType
  11. from ..utils import (
  12. _chunk_generator,
  13. check_pandas_support,
  14. get_chunk_n_rows,
  15. )


def _split_sparse_columns(
    arff_data: ArffSparseDataType, include_columns: List
) -> ArffSparseDataType:
    """Obtain several columns from a sparse ARFF representation.

    The column indices are also re-labelled relative to the columns that are
    kept (e.g., when including [1, 2, 3], the columns are relabelled to
    [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size: the first list holds the values,
        the second the row (x) coordinates and the third the column (y)
        coordinates.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of the ARFF data containing only the columns indicated by the
        `include_columns` argument.
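
    Examples
    --------
    A minimal illustration on hypothetical data (not taken from a real ARFF
    file): keeping columns 1 and 3 relabels them to 0 and 1.

    >>> _split_sparse_columns(([10.0, 20.0, 30.0], [0, 0, 1], [1, 2, 3]), [1, 3])
    ([10.0, 30.0], [0, 1], [0, 1])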
  35. """
  36. arff_data_new: ArffSparseDataType = (list(), list(), list())
  37. reindexed_columns = {
  38. column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
  39. }
  40. for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
  41. if col_idx in include_columns:
  42. arff_data_new[0].append(val)
  43. arff_data_new[1].append(row_idx)
  44. arff_data_new[2].append(reindexed_columns[col_idx])
  45. return arff_data_new


def _sparse_data_to_array(
    arff_data: ArffSparseDataType, include_columns: List
) -> np.ndarray:
    # turns the sparse data back into an array (we cannot use the `toarray`
    # method, as that only works on numeric data)
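    # For example (illustrative values): with arff_data = ([1.0, 2.0], [0, 1],
    # [3, 3]) and include_columns = [3], the result is the (2, 1) array
    # [[1.0], [2.0]].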
    num_obs = max(arff_data[1]) + 1
    y_shape = (num_obs, len(include_columns))
    reindexed_columns = {
        column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
    }
    # TODO: improve for efficiency
    y = np.empty(y_shape, dtype=np.float64)
    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
        if col_idx in include_columns:
            y[row_idx, reindexed_columns[col_idx]] = val
    return y


def _post_process_frame(frame, feature_names, target_names):
    """Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
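
    Examples
    --------
    A small sketch on a toy dataframe (illustrative column names):

    >>> import pandas as pd
    >>> frame = pd.DataFrame({"a": [1, 2], "b": [3, 4], "t": [0, 1]})
    >>> X, y = _post_process_frame(frame, ["a", "b"], ["t"])
    >>> list(X.columns), y.name
    (['a', 'b'], 't')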
  78. """
  79. X = frame[feature_names]
  80. if len(target_names) >= 2:
  81. y = frame[target_names]
  82. elif len(target_names) == 1:
  83. y = frame[target_names[0]]
  84. else:
  85. y = None
  86. return X, y


def _liac_arff_parser(
    gzip_file,
    output_arrays_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    shape=None,
):
    """ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handle the
    dtypes exactly the same way.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The compressed file to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    shape : tuple of int, default=None
        The expected `(n_samples, n_columns)` shape of the full data matrix.
        Only required when the dense data is consumed as a generator; the
        first element can be -1 if the number of rows is unknown upfront.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict or None
        A dictionary mapping each categorical column name to its list of
        categories. `None` if `output_array_type == "pandas"`.
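
    Examples
    --------
    A minimal sketch of how this parser is typically driven (the file name and
    column metadata below are illustrative, not real OpenML values)::

        import gzip

        with gzip.open("dataset.arff.gz", "rb") as gzip_file:
            X, y, frame, categories = _liac_arff_parser(
                gzip_file,
                output_arrays_type="sparse",
                openml_columns_info={
                    "feat_0": {"index": "0", "data_type": "numeric"},
                    "class": {"index": "1", "data_type": "nominal"},
                },
                feature_names_to_select=["feat_0"],
                target_names_to_select=["class"],
            )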
  129. """
  130. def _io_to_generator(gzip_file):
  131. for line in gzip_file:
  132. yield line.decode("utf-8")
  133. stream = _io_to_generator(gzip_file)
  134. # find which type (dense or sparse) ARFF type we will have to deal with
  135. return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN
  136. # we should not let LIAC-ARFF to encode the nominal attributes with NumPy
  137. # arrays to have only numerical values.
  138. encode_nominal = not (output_arrays_type == "pandas")
  139. arff_container = _arff.load(
  140. stream, return_type=return_type, encode_nominal=encode_nominal
  141. )
    columns_to_select = feature_names_to_select + target_names_to_select

    categories = {
        name: cat
        for name, cat in arff_container["attributes"]
        if isinstance(cat, list) and name in columns_to_select
    }
    if output_arrays_type == "pandas":
        pd = check_pandas_support("fetch_openml with as_frame=True")

        columns_info = OrderedDict(arff_container["attributes"])
        columns_names = list(columns_info.keys())

        # calculate chunksize
        first_row = next(arff_container["data"])
        first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)

        row_bytes = first_df.memory_usage(deep=True).sum()
        chunksize = get_chunk_n_rows(row_bytes)

        # read arff data with chunks
        columns_to_keep = [col for col in columns_names if col in columns_to_select]
        dfs = [first_df[columns_to_keep]]
        for data in _chunk_generator(arff_container["data"], chunksize):
            dfs.append(
                pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
            )
        # dfs[0] contains only one row, which may not carry enough data to
        # infer the columns' dtypes. Here we use `dfs[1]` to configure the
        # dtypes of `dfs[0]`.
        if len(dfs) >= 2:
            dfs[0] = dfs[0].astype(dfs[1].dtypes)

        # The liac-arff parser does not depend on NumPy and uses None to
        # represent missing values. To be consistent with the pandas parser,
        # we replace None with np.nan.
        frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan)
        del dfs, first_df

        # cast the columns of the frame to the dtypes expected from the
        # OpenML metadata
        dtypes = {}
        for name in frame.columns:
            column_dtype = openml_columns_info[name]["data_type"]
            if column_dtype.lower() == "integer":
                # Use a pandas extension array instead of np.int64 to be able
                # to support missing values.
                dtypes[name] = "Int64"
            elif column_dtype.lower() == "nominal":
                dtypes[name] = "category"
            else:
                dtypes[name] = frame.dtypes[name]
        frame = frame.astype(dtypes)

        X, y = _post_process_frame(
            frame, feature_names_to_select, target_names_to_select
        )
    else:
        arff_data = arff_container["data"]

        feature_indices_to_select = [
            int(openml_columns_info[col_name]["index"])
            for col_name in feature_names_to_select
        ]
        target_indices_to_select = [
            int(openml_columns_info[col_name]["index"])
            for col_name in target_names_to_select
        ]

        if isinstance(arff_data, Generator):
            if shape is None:
                raise ValueError(
                    "shape must be provided when arr['data'] is a Generator"
                )
            if shape[0] == -1:
                count = -1
            else:
                count = shape[0] * shape[1]
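            # `np.fromiter` consumes the rows of the generator flattened into
            # a single stream of values; `count=-1` lets NumPy grow the buffer
            # when the number of rows is unknown upfront.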
            data = np.fromiter(
                itertools.chain.from_iterable(arff_data),
                dtype="float64",
                count=count,
            )
            data = data.reshape(*shape)
            X = data[:, feature_indices_to_select]
            y = data[:, target_indices_to_select]
        elif isinstance(arff_data, tuple):
            arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select)
            num_obs = max(arff_data[1]) + 1
            X_shape = (num_obs, len(feature_indices_to_select))
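            # assemble the selected feature columns into a COO matrix, then
            # convert it to CSR, which is cheaper to slice row-wise downstream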
            X = sp.sparse.coo_matrix(
                (arff_data_X[0], (arff_data_X[1], arff_data_X[2])),
                shape=X_shape,
                dtype=np.float64,
            )
            X = X.tocsr()
            y = _sparse_data_to_array(arff_data, target_indices_to_select)
        else:
            # This should never happen
            raise ValueError(
                f"Unexpected type for data obtained from arff: {type(arff_data)}"
            )

        is_classification = {
            col_name in categories for col_name in target_names_to_select
        }
        if not is_classification:
            # No target
            pass
        elif all(is_classification):
            y = np.hstack(
                [
                    np.take(
                        np.asarray(categories.pop(col_name), dtype="O"),
                        y[:, i : i + 1].astype(int, copy=False),
                    )
                    for i, col_name in enumerate(target_names_to_select)
                ]
            )
        elif any(is_classification):
            raise ValueError(
                "Mix of nominal and non-nominal targets is not currently supported"
            )

        # reshape y back to a 1-D array if there is only one target column;
        # back to None if there are no target columns
        if y.shape[1] == 1:
            y = y.reshape((-1,))
        elif y.shape[1] == 0:
            y = None

    if output_arrays_type == "pandas":
        return X, y, frame, None
    return X, y, None, categories


def _pandas_arff_parser(
    gzip_file,
    output_arrays_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    read_csv_kwargs=None,
):
    """ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the
    metadata headers of the ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict or None
        A dictionary mapping each categorical column name to its list of
        categories. `None` if `output_array_type == "pandas"`.
  304. """
  305. import pandas as pd
  306. # read the file until the data section to skip the ARFF metadata headers
  307. for line in gzip_file:
  308. if line.decode("utf-8").lower().startswith("@data"):
  309. break
  310. dtypes = {}
  311. for name in openml_columns_info:
  312. column_dtype = openml_columns_info[name]["data_type"]
  313. if column_dtype.lower() == "integer":
  314. # Use Int64 to infer missing values from data
  315. # XXX: this line is not covered by our tests. Is this really needed?
  316. dtypes[name] = "Int64"
  317. elif column_dtype.lower() == "nominal":
  318. dtypes[name] = "category"
  319. # since we will not pass `names` when reading the ARFF file, we need to translate
  320. # `dtypes` from column names to column indices to pass to `pandas.read_csv`
  321. dtypes_positional = {
  322. col_idx: dtypes[name]
  323. for col_idx, name in enumerate(openml_columns_info)
  324. if name in dtypes
  325. }
  326. default_read_csv_kwargs = {
  327. "header": None,
  328. "index_col": False, # always force pandas to not use the first column as index
  329. "na_values": ["?"], # missing values are represented by `?`
  330. "keep_default_na": False, # only `?` is a missing value given the ARFF specs
  331. "comment": "%", # skip line starting by `%` since they are comments
  332. "quotechar": '"', # delimiter to use for quoted strings
  333. "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
  334. "escapechar": "\\",
  335. "dtype": dtypes_positional,
  336. }
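    # For instance (an illustrative override, not a default used here), passing
    # read_csv_kwargs={"engine": "python"} would switch the parsing engine:
    # user-provided keys take precedence over the defaults above.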
    read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
    frame = pd.read_csv(gzip_file, **read_csv_kwargs)
    try:
        # Setting the columns while reading the file will select the N first
        # columns and not raise a ParserError. Instead, we set the columns
        # after reading the file and raise a ParserError if the number of
        # columns does not match the number of columns in the metadata given
        # by OpenML.
        frame.columns = [name for name in openml_columns_info]
    except ValueError as exc:
        raise pd.errors.ParserError(
            "The number of columns provided by OpenML does not match the number of "
            "columns inferred by pandas when reading the file."
        ) from exc

    columns_to_select = feature_names_to_select + target_names_to_select
    columns_to_keep = [col for col in frame.columns if col in columns_to_select]
    frame = frame[columns_to_keep]

    # `pd.read_csv` automatically handles double quotes for quoting non-numeric
    # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be configured
    # to consider both single quotes and double quotes as valid quoting chars
    # at the same time since this case does not occur in regular (non-ARFF) CSV
    # files. To mimic the behavior of the LIAC-ARFF parser, we manually strip
    # single quotes on categories as a post-processing step if needed.
    #
    # Note however that we intentionally do not attempt to do this kind of
    # manual post-processing of (non-categorical) string-typed columns because
    # we cannot resolve the ambiguity of a CSV cell with nested quoting such as
    # `"'some string value'"` with pandas.
    single_quote_pattern = re.compile(r"^'(?P<contents>.*)'$")

    def strip_single_quotes(input_string):
        match = re.search(single_quote_pattern, input_string)
        if match is None:
            return input_string

        return match.group("contents")
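    # e.g. strip_single_quotes("'red'") returns "red", while a value without
    # wrapping single quotes, such as "red", is returned unchanged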
    categorical_columns = [
        name
        for name, dtype in frame.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
    ]
    for col in categorical_columns:
        frame[col] = frame[col].cat.rename_categories(strip_single_quotes)

    X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select)

    if output_arrays_type == "pandas":
        return X, y, frame, None
    else:
        X, y = X.to_numpy(), y.to_numpy()

    categories = {
        name: dtype.categories.tolist()
        for name, dtype in frame.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
    }
    return X, y, None, categories


def load_arff_from_gzip_file(
    gzip_file,
    parser,
    output_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    shape=None,
    read_csv_kwargs=None,
):
    """Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The compressed file to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    shape : tuple of int, default=None
        The expected `(n_samples, n_columns)` shape of the dense data matrix.
        Only used by the "liac-arff" parser when the data is consumed as a
        generator.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict or None
        A dictionary mapping each categorical column name to its list of
        categories. `None` if `output_array_type == "pandas"`.
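
    Examples
    --------
    A minimal sketch (the file name and column metadata are illustrative, not
    real OpenML values)::

        import gzip

        with gzip.open("dataset.arff.gz", "rb") as gzip_file:
            X, y, frame, categories = load_arff_from_gzip_file(
                gzip_file,
                parser="pandas",
                output_type="pandas",
                openml_columns_info={
                    "width": {"index": "0", "data_type": "numeric"},
                    "class": {"index": "1", "data_type": "nominal"},
                },
                feature_names_to_select=["width"],
                target_names_to_select=["class"],
            )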
  434. """
  435. if parser == "liac-arff":
  436. return _liac_arff_parser(
  437. gzip_file,
  438. output_type,
  439. openml_columns_info,
  440. feature_names_to_select,
  441. target_names_to_select,
  442. shape,
  443. )
  444. elif parser == "pandas":
  445. return _pandas_arff_parser(
  446. gzip_file,
  447. output_type,
  448. openml_columns_info,
  449. feature_names_to_select,
  450. target_names_to_select,
  451. read_csv_kwargs,
  452. )
  453. else:
  454. raise ValueError(
  455. f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'."
  456. )