"""Implementation of ARFF parsers: via LIAC-ARFF and pandas."""

import itertools
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import List

import numpy as np
import scipy as sp

from ..externals import _arff
from ..externals._arff import ArffSparseDataType
from ..utils import (
    _chunk_generator,
    check_pandas_support,
    get_chunk_n_rows,
)


def _split_sparse_columns(
    arff_data: ArffSparseDataType, include_columns: List
) -> ArffSparseDataType:
    """Obtain several columns from a sparse ARFF representation.

    Additionally, the column indices are re-labelled, given the columns that
    are not included (e.g., when including [1, 2, 3], the columns will be
    relabelled to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; the first list indicates the
        values, the second the row (x) coordinates and the third the column
        (y) coordinates.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of the ARFF data containing only the columns indicated by the
        include_columns argument.
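
    Examples
    --------
    A minimal sketch with hand-written COO-style triples (illustrative
    values, not from a real OpenML payload):

    >>> data = ([10.0, 20.0, 30.0], [0, 0, 1], [0, 2, 1])
    >>> _split_sparse_columns(data, include_columns=[1, 2])
    ([20.0, 30.0], [0, 1], [1, 0])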
    """
    arff_data_new: ArffSparseDataType = (list(), list(), list())
    reindexed_columns = {
        column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
    }
    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
        if col_idx in include_columns:
            arff_data_new[0].append(val)
            arff_data_new[1].append(row_idx)
            arff_data_new[2].append(reindexed_columns[col_idx])
    return arff_data_new


def _sparse_data_to_array(
    arff_data: ArffSparseDataType, include_columns: List
) -> np.ndarray:
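    """Convert the sparse ARFF data to a dense 2-D ndarray.

    A minimal doctest sketch with hand-written COO-style triples
    (illustrative values; note that the output buffer is created with
    `np.empty`, so every included cell should be present in the data):

    >>> _sparse_data_to_array(([7.0, 8.0], [0, 1], [2, 2]), include_columns=[2])
    array([[7.],
           [8.]])
    """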
    # turn the sparse data back into an array (we can't use the `toarray`
    # method, as it only works on numeric data)
    num_obs = max(arff_data[1]) + 1
    y_shape = (num_obs, len(include_columns))
    reindexed_columns = {
        column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
    }
    # TODO: improve for efficiency
    y = np.empty(y_shape, dtype=np.float64)
    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
        if col_idx in include_columns:
            y[row_idx, reindexed_columns[col_idx]] = val
    return y


def _post_process_frame(frame, feature_names, target_names):
    """Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
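
    Examples
    --------
    A minimal sketch on a hand-built dataframe (illustrative column names):

    >>> import pandas as pd
    >>> frame = pd.DataFrame({"a": [1, 2], "b": [3, 4], "t": [0, 1]})
    >>> X, y = _post_process_frame(frame, ["a", "b"], ["t"])
    >>> list(X.columns), y.name
    (['a', 'b'], 't')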
    """
    X = frame[feature_names]
    if len(target_names) >= 2:
        y = frame[target_names]
    elif len(target_names) == 1:
        y = frame[target_names[0]]
    else:
        y = None
    return X, y


def _liac_arff_parser(
    gzip_file,
    output_arrays_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    shape=None,
):
    """ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is
    needed to parse sparse datasets. For dense datasets, it is recommended
    to instead use the pandas-based parser, although it does not always
    handle the dtypes exactly the same way.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The compressed file to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy
          array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    shape : tuple of (int, int), default=None
        The expected `(n_samples, n_columns)` shape of the dense data. It is
        required when the ARFF data is exposed as a generator; `n_samples`
        may be -1 when the number of samples is unknown upfront.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict of {str: list} or None
        A mapping from the name of each categorical feature to its list of
        categories. `None` if `output_array_type == "pandas"`.
    """

    def _io_to_generator(gzip_file):
        for line in gzip_file:
            yield line.decode("utf-8")

    stream = _io_to_generator(gzip_file)

    # find out whether we have to deal with a dense or a sparse ARFF file
    return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN
    # When returning a pandas DataFrame, we should not let LIAC-ARFF encode
    # the nominal attributes: pandas handles the categorical dtype itself.
    # For the other output types, nominal attributes are ordinal-encoded so
    # that the data contains only numerical values.
    encode_nominal = not (output_arrays_type == "pandas")
    arff_container = _arff.load(
        stream, return_type=return_type, encode_nominal=encode_nominal
    )
    columns_to_select = feature_names_to_select + target_names_to_select

    categories = {
        name: cat
        for name, cat in arff_container["attributes"]
        if isinstance(cat, list) and name in columns_to_select
    }

    if output_arrays_type == "pandas":
        pd = check_pandas_support("fetch_openml with as_frame=True")

        columns_info = OrderedDict(arff_container["attributes"])
        columns_names = list(columns_info.keys())

        # calculate chunksize
        first_row = next(arff_container["data"])
        first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)

        row_bytes = first_df.memory_usage(deep=True).sum()
        chunksize = get_chunk_n_rows(row_bytes)
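        # For instance, with roughly 1 kB per row and the default working
        # memory of 1024 MB, `get_chunk_n_rows` would yield chunks of about
        # one million rows (illustrative figures).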

        # read ARFF data in chunks
        columns_to_keep = [col for col in columns_names if col in columns_to_select]
        dfs = [first_df[columns_to_keep]]
        for data in _chunk_generator(arff_container["data"], chunksize):
            dfs.append(
                pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
            )
        # dfs[0] contains only one row, which may not hold enough data to
        # infer each column's dtype. Here we use `dfs[1]` to configure the
        # dtypes in dfs[0].
        if len(dfs) >= 2:
            dfs[0] = dfs[0].astype(dfs[1].dtypes)

        # The liac-arff parser does not depend on NumPy and uses None to
        # represent missing values. To be consistent with the pandas parser,
        # we replace None with np.nan.
        frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan)
        del dfs, first_df

        # cast the columns of the frame
        dtypes = {}
        for name in frame.columns:
            column_dtype = openml_columns_info[name]["data_type"]
            if column_dtype.lower() == "integer":
                # Use a pandas extension array instead of np.int64 to be able
                # to support missing values.
                dtypes[name] = "Int64"
            elif column_dtype.lower() == "nominal":
                dtypes[name] = "category"
            else:
                dtypes[name] = frame.dtypes[name]
        frame = frame.astype(dtypes)

        X, y = _post_process_frame(
            frame, feature_names_to_select, target_names_to_select
        )
    else:
        arff_data = arff_container["data"]

        feature_indices_to_select = [
            int(openml_columns_info[col_name]["index"])
            for col_name in feature_names_to_select
        ]
        target_indices_to_select = [
            int(openml_columns_info[col_name]["index"])
            for col_name in target_names_to_select
        ]

        if isinstance(arff_data, Generator):
            if shape is None:
                raise ValueError(
                    "shape must be provided when arr['data'] is a Generator"
                )
            if shape[0] == -1:
                count = -1
            else:
                count = shape[0] * shape[1]
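            # `np.fromiter` materializes the row generator into a flat 1-D
            # array in a single pass; passing an exact `count` lets NumPy
            # preallocate the buffer, while -1 means the number of elements
            # is unknown upfront.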
            data = np.fromiter(
                itertools.chain.from_iterable(arff_data),
                dtype="float64",
                count=count,
            )
            data = data.reshape(*shape)
            X = data[:, feature_indices_to_select]
            y = data[:, target_indices_to_select]
        elif isinstance(arff_data, tuple):
            arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select)
            num_obs = max(arff_data[1]) + 1
            X_shape = (num_obs, len(feature_indices_to_select))
            X = sp.sparse.coo_matrix(
                (arff_data_X[0], (arff_data_X[1], arff_data_X[2])),
                shape=X_shape,
                dtype=np.float64,
            )
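            # Convert from COO to CSR: CSR supports the row slicing and
            # arithmetic operations that downstream consumers expect, while
            # COO does not.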
            X = X.tocsr()
            y = _sparse_data_to_array(arff_data, target_indices_to_select)
        else:
            # This should never happen
            raise ValueError(
                f"Unexpected type for data obtained from arff: {type(arff_data)}"
            )

        is_classification = {
            col_name in categories for col_name in target_names_to_select
        }
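        # `is_classification` is a set of booleans: empty when there is no
        # target column, {True} when all targets are nominal, {False} when
        # none are, and {True, False} for a mix, which drives the checks
        # below.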
        if not is_classification:
            # No target
            pass
        elif all(is_classification):
            y = np.hstack(
                [
                    np.take(
                        np.asarray(categories.pop(col_name), dtype="O"),
                        y[:, i : i + 1].astype(int, copy=False),
                    )
                    for i, col_name in enumerate(target_names_to_select)
                ]
            )
        elif any(is_classification):
            raise ValueError(
                "Mix of nominal and non-nominal targets is not currently supported"
            )

        # reshape y back to a 1-D array if there is only one target column;
        # set it back to None if there are no target columns
        if y.shape[1] == 1:
            y = y.reshape((-1,))
        elif y.shape[1] == 0:
            y = None

    if output_arrays_type == "pandas":
        return X, y, frame, None
    return X, y, None, categories


def _pandas_arff_parser(
    gzip_file,
    output_arrays_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    read_csv_kwargs=None,
):
    """ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the
    metadata headers of the ARFF file itself. The data is loaded as a CSV
    file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy
          array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. They allow
        overwriting the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict of {str: list} or None
        A mapping from the name of each categorical feature to its list of
        categories. `None` if `output_array_type == "pandas"`.
    """
    import pandas as pd

    # read the file until the data section to skip the ARFF metadata headers
    for line in gzip_file:
        if line.decode("utf-8").lower().startswith("@data"):
            break
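
    # For reference, a (hypothetical) ARFF payload looks like:
    #
    #     @RELATION iris
    #     @ATTRIBUTE sepal_length NUMERIC
    #     @ATTRIBUTE species {setosa, versicolor}
    #     @DATA
    #     5.1, setosa
    #     4.9, versicolor
    #
    # Everything up to and including the `@DATA` line is skipped above; the
    # remaining lines are handed to `pandas.read_csv`.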

    dtypes = {}
    for name in openml_columns_info:
        column_dtype = openml_columns_info[name]["data_type"]
        if column_dtype.lower() == "integer":
            # Use Int64 to infer missing values from data
            # XXX: this line is not covered by our tests. Is this really needed?
            dtypes[name] = "Int64"
        elif column_dtype.lower() == "nominal":
            dtypes[name] = "category"
    # since we will not pass `names` when reading the ARFF file, we need to
    # translate `dtypes` from column names to column indices to pass to
    # `pandas.read_csv`
    dtypes_positional = {
        col_idx: dtypes[name]
        for col_idx, name in enumerate(openml_columns_info)
        if name in dtypes
    }
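    # For instance, if the second and fourth OpenML columns are nominal,
    # `dtypes_positional` would be {1: "category", 3: "category"}
    # (illustrative indices).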

    default_read_csv_kwargs = {
        "header": None,
        "index_col": False,  # always force pandas to not use the first column as index
        "na_values": ["?"],  # missing values are represented by `?`
        "keep_default_na": False,  # only `?` is a missing value given the ARFF specs
        "comment": "%",  # skip lines starting with `%` since they are comments
        "quotechar": '"',  # delimiter to use for quoted strings
        "skipinitialspace": True,  # skip spaces after delimiter to follow ARFF specs
        "escapechar": "\\",
        "dtype": dtypes_positional,
    }
    read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
    frame = pd.read_csv(gzip_file, **read_csv_kwargs)
    try:
        # Setting the columns while reading the file would select the first N
        # columns and not raise a ParserError. Instead, we set the columns
        # after reading the file and raise a ParserError if the number of
        # columns does not match the number of columns in the metadata given
        # by OpenML.
        frame.columns = [name for name in openml_columns_info]
    except ValueError as exc:
        raise pd.errors.ParserError(
            "The number of columns provided by OpenML does not match the number of "
            "columns inferred by pandas when reading the file."
        ) from exc

    columns_to_select = feature_names_to_select + target_names_to_select
    columns_to_keep = [col for col in frame.columns if col in columns_to_select]
    frame = frame[columns_to_keep]

    # `pd.read_csv` automatically handles double quotes for quoting non-numeric
    # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be
    # configured to consider both single quotes and double quotes as valid
    # quoting characters at the same time since this case does not occur in
    # regular (non-ARFF) CSV files. To mimic the behavior of the LIAC-ARFF
    # parser, we manually strip single quotes on categories as a
    # post-processing step, if needed.
    #
    # Note however that we intentionally do not attempt to do this kind of
    # manual post-processing of (non-categorical) string-typed columns because
    # we cannot resolve the ambiguity of CSV cells with nested quoting such as
    # `"'some string value'"` with pandas.
    single_quote_pattern = re.compile(r"^'(?P<contents>.*)'$")

    def strip_single_quotes(input_string):
        match = re.search(single_quote_pattern, input_string)
        if match is None:
            return input_string

        return match.group("contents")
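
    # For example, strip_single_quotes("'red'") returns "red", while inputs
    # without matching outer single quotes, such as "red" or "'red", are
    # returned unchanged.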

    categorical_columns = [
        name
        for name, dtype in frame.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
    ]
    for col in categorical_columns:
        frame[col] = frame[col].cat.rename_categories(strip_single_quotes)

    X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select)

    if output_arrays_type == "pandas":
        return X, y, frame, None
    else:
        X = X.to_numpy()
        # `y` is None when no target column was selected
        y = y.to_numpy() if y is not None else None

    categories = {
        name: dtype.categories.tolist()
        for name, dtype in frame.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
    }
    return X, y, None, categories


def load_arff_from_gzip_file(
    gzip_file,
    parser,
    output_type,
    openml_columns_info,
    feature_names_to_select,
    target_names_to_select,
    shape=None,
    read_csv_kwargs=None,
):
    """Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The compressed file to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be a sparse matrix and `y` will be a NumPy
          array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    shape : tuple of (int, int), default=None
        The expected `(n_samples, n_columns)` shape of the dense data. Only
        used by the "liac-arff" parser when the data is exposed as a
        generator.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. They allow
        overwriting the default options. Only used by the "pandas" parser.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : dict of {str: list} or None
        A mapping from the name of each categorical feature to its list of
        categories. `None` if `output_array_type == "pandas"`.
    """
    if parser == "liac-arff":
        return _liac_arff_parser(
            gzip_file,
            output_type,
            openml_columns_info,
            feature_names_to_select,
            target_names_to_select,
            shape,
        )
    elif parser == "pandas":
        return _pandas_arff_parser(
            gzip_file,
            output_type,
            openml_columns_info,
            feature_names_to_select,
            target_names_to_select,
            read_csv_kwargs,
        )
    else:
        raise ValueError(
            f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'."
        )
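

# A minimal usage sketch, assuming a local GZip-compressed ARFF file and
# hand-built OpenML-style column metadata (both hypothetical; within
# scikit-learn this function is driven by `fetch_openml`):
#
#     import gzip
#
#     columns_info = {
#         "sepal_length": {"index": "0", "data_type": "numeric"},
#         "species": {"index": "1", "data_type": "nominal"},
#     }
#     with gzip.open("dataset.arff.gz", "rb") as gzip_file:
#         X, y, frame, categories = load_arff_from_gzip_file(
#             gzip_file,
#             parser="pandas",
#             output_type="pandas",
#             openml_columns_info=columns_info,
#             feature_names_to_select=["sepal_length"],
#             target_names_to_select=["species"],
#         )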