XMD
/
Lightstar


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
							import textwrap
from io import BytesIO

import pytest

from sklearn.datasets._arff_parser import (
    _liac_arff_parser,
    _pandas_arff_parser,
    _post_process_frame,
    load_arff_from_gzip_file,
)


@pytest.mark.parametrize(
    "feature_names, target_names",
    [
        (
            [
                "col_int_as_integer",
                "col_int_as_numeric",
                "col_float_as_real",
                "col_float_as_numeric",
            ],
            ["col_categorical", "col_string"],
        ),
        (
            [
                "col_int_as_integer",
                "col_int_as_numeric",
                "col_float_as_real",
                "col_float_as_numeric",
            ],
            ["col_categorical"],
        ),
        (
            [
                "col_int_as_integer",
                "col_int_as_numeric",
                "col_float_as_real",
                "col_float_as_numeric",
            ],
            [],
        ),
    ],
)
def test_post_process_frame(feature_names, target_names):
    """Check the behaviour of the post-processing function for splitting a dataframe."""
    pd = pytest.importorskip("pandas")

    X_original = pd.DataFrame(
        {
            "col_int_as_integer": [1, 2, 3],
            "col_int_as_numeric": [1, 2, 3],
            "col_float_as_real": [1.0, 2.0, 3.0],
            "col_float_as_numeric": [1.0, 2.0, 3.0],
            "col_categorical": ["a", "b", "c"],
            "col_string": ["a", "b", "c"],
        }
    )

    X, y = _post_process_frame(X_original, feature_names, target_names)
    assert isinstance(X, pd.DataFrame)
    if len(target_names) >= 2:
        assert isinstance(y, pd.DataFrame)
    elif len(target_names) == 1:
        assert isinstance(y, pd.Series)
    else:
        assert y is None


def test_load_arff_from_gzip_file_error_parser():
    """An error will be raised if the parser is not known."""
    # None of the input parameters are required to be accurate since the check
    # of the parser will be carried out first.

    err_msg = "Unknown parser: 'xxx'. Should be 'liac-arff' or 'pandas'"
    with pytest.raises(ValueError, match=err_msg):
        load_arff_from_gzip_file("xxx", "xxx", "xxx", "xxx", "xxx", "xxx")


@pytest.mark.parametrize("parser_func", [_liac_arff_parser, _pandas_arff_parser])
def test_pandas_arff_parser_strip_single_quotes(parser_func):
    """Check that we properly strip single quotes from the data."""
    pd = pytest.importorskip("pandas")

    arff_file = BytesIO(textwrap.dedent("""
            @relation 'toy'
            @attribute 'cat_single_quote' {'A', 'B', 'C'}
            @attribute 'str_single_quote' string
            @attribute 'str_nested_quote' string
            @attribute 'class' numeric
            @data
            'A','some text','\"expect double quotes\"',0
            """).encode("utf-8"))

    columns_info = {
        "cat_single_quote": {
            "data_type": "nominal",
            "name": "cat_single_quote",
        },
        "str_single_quote": {
            "data_type": "string",
            "name": "str_single_quote",
        },
        "str_nested_quote": {
            "data_type": "string",
            "name": "str_nested_quote",
        },
        "class": {
            "data_type": "numeric",
            "name": "class",
        },
    }

    feature_names = [
        "cat_single_quote",
        "str_single_quote",
        "str_nested_quote",
    ]
    target_names = ["class"]

    # We don't strip single quotes for string columns with the pandas parser.
    expected_values = {
        "cat_single_quote": "A",
        "str_single_quote": (
            "some text" if parser_func is _liac_arff_parser else "'some text'"
        ),
        "str_nested_quote": (
            '"expect double quotes"'
            if parser_func is _liac_arff_parser
            else "'\"expect double quotes\"'"
        ),
        "class": 0,
    }

    _, _, frame, _ = parser_func(
        arff_file,
        output_arrays_type="pandas",
        openml_columns_info=columns_info,
        feature_names_to_select=feature_names,
        target_names_to_select=target_names,
    )

    assert frame.columns.tolist() == feature_names + target_names
    pd.testing.assert_series_equal(frame.iloc[0], pd.Series(expected_values, name=0))


@pytest.mark.parametrize("parser_func", [_liac_arff_parser, _pandas_arff_parser])
def test_pandas_arff_parser_strip_double_quotes(parser_func):
    """Check that we properly strip double quotes from the data."""
    pd = pytest.importorskip("pandas")

    arff_file = BytesIO(textwrap.dedent("""
            @relation 'toy'
            @attribute 'cat_double_quote' {"A", "B", "C"}
            @attribute 'str_double_quote' string
            @attribute 'str_nested_quote' string
            @attribute 'class' numeric
            @data
            "A","some text","\'expect double quotes\'",0
            """).encode("utf-8"))

    columns_info = {
        "cat_double_quote": {
            "data_type": "nominal",
            "name": "cat_double_quote",
        },
        "str_double_quote": {
            "data_type": "string",
            "name": "str_double_quote",
        },
        "str_nested_quote": {
            "data_type": "string",
            "name": "str_nested_quote",
        },
        "class": {
            "data_type": "numeric",
            "name": "class",
        },
    }

    feature_names = [
        "cat_double_quote",
        "str_double_quote",
        "str_nested_quote",
    ]
    target_names = ["class"]

    expected_values = {
        "cat_double_quote": "A",
        "str_double_quote": "some text",
        "str_nested_quote": "'expect double quotes'",
        "class": 0,
    }

    _, _, frame, _ = parser_func(
        arff_file,
        output_arrays_type="pandas",
        openml_columns_info=columns_info,
        feature_names_to_select=feature_names,
        target_names_to_select=target_names,
    )

    assert frame.columns.tolist() == feature_names + target_names
    pd.testing.assert_series_equal(frame.iloc[0], pd.Series(expected_values, name=0))


@pytest.mark.parametrize(
    "parser_func",
    [
        # internal quotes are not considered to follow the ARFF spec in LIAC ARFF
        pytest.param(_liac_arff_parser, marks=pytest.mark.xfail),
        _pandas_arff_parser,
    ],
)
def test_pandas_arff_parser_strip_no_quotes(parser_func):
    """Check that we properly parse with no quotes characters."""
    pd = pytest.importorskip("pandas")

    arff_file = BytesIO(textwrap.dedent("""
            @relation 'toy'
            @attribute 'cat_without_quote' {A, B, C}
            @attribute 'str_without_quote' string
            @attribute 'str_internal_quote' string
            @attribute 'class' numeric
            @data
            A,some text,'internal' quote,0
            """).encode("utf-8"))

    columns_info = {
        "cat_without_quote": {
            "data_type": "nominal",
            "name": "cat_without_quote",
        },
        "str_without_quote": {
            "data_type": "string",
            "name": "str_without_quote",
        },
        "str_internal_quote": {
            "data_type": "string",
            "name": "str_internal_quote",
        },
        "class": {
            "data_type": "numeric",
            "name": "class",
        },
    }

    feature_names = [
        "cat_without_quote",
        "str_without_quote",
        "str_internal_quote",
    ]
    target_names = ["class"]

    expected_values = {
        "cat_without_quote": "A",
        "str_without_quote": "some text",
        "str_internal_quote": "'internal' quote",
        "class": 0,
    }

    _, _, frame, _ = parser_func(
        arff_file,
        output_arrays_type="pandas",
        openml_columns_info=columns_info,
        feature_names_to_select=feature_names,
        target_names_to_select=target_names,
    )

    assert frame.columns.tolist() == feature_names + target_names
    pd.testing.assert_series_equal(frame.iloc[0], pd.Series(expected_values, name=0))