| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643 |
- # coding=utf-8
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """GLUE processors and helpers"""
- import os
- import warnings
- from dataclasses import asdict
- from enum import Enum
- from typing import List, Optional, Union
- from ...tokenization_utils import PreTrainedTokenizer
- from ...utils import is_tf_available, logging
- from .utils import DataProcessor, InputExample, InputFeatures
- if is_tf_available():
- import tensorflow as tf
- logger = logging.get_logger(__name__)
# Message template for the FutureWarning raised by the helpers in this module.
# `{0}` is filled with the kind of object being deprecated ("function" or "processor").
DEPRECATION_WARNING = (
    "This {0} will be removed from the library soon, "
    "preprocessing should be handled with the 🤗 Datasets library. "
    "You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)
def glue_convert_examples_to_features(
    examples: Union[List[InputExample], "tf.data.Dataset"],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    """
    Convert GLUE examples into `InputFeatures` (deprecated entry point).

    Emits a `FutureWarning` on every call, then dispatches on the type of `examples`.

    Args:
        examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
        tokenizer: Tokenizer instance used to encode the examples.
        max_length: Maximum example length. When `None`, the list-based path falls back to
            `tokenizer.model_max_length`.
        task: GLUE task name. Mandatory when `examples` is a `tf.data.Dataset`.
        label_list: List of labels, e.g. as returned by `processor.get_labels()`. Not forwarded on the
            `tf.data.Dataset` path.
        output_mode: Either `regression` or `classification`. Not forwarded on the `tf.data.Dataset` path.

    Returns:
        A `tf.data.Dataset` of task-specific features when `examples` is a `tf.data.Dataset`; otherwise a list of
        task-specific `InputFeatures` which can be fed to the model.

    Raises:
        ValueError: If `examples` is a `tf.data.Dataset` and `task` is `None`.
    """
    warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)

    # `tf` is only imported when TensorFlow is available, so availability must be checked first.
    is_tf_dataset = is_tf_available() and isinstance(examples, tf.data.Dataset)
    if not is_tf_dataset:
        return _glue_convert_examples_to_features(
            examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
        )

    if task is None:
        raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
    return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
if is_tf_available():

    def _tf_glue_convert_examples_to_features(
        examples: tf.data.Dataset,
        tokenizer: PreTrainedTokenizer,
        task: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> tf.data.Dataset:
        """
        Convert a `tf.data.Dataset` of GLUE tensor dicts into a `tf.data.Dataset` of features.

        Args:
            examples: Dataset whose elements are passed to the task processor's
                `get_example_from_tensor_dict`.
            tokenizer: Tokenizer used to encode the examples; its `model_input_names` define the feature keys
                of the returned dataset.
            task: GLUE task name, used as a key into `glue_processors`. Required; it only keeps a default so the
                parameter order stays unchanged (the previous default was the `str` type itself, a typo for
                the annotation).
            max_length: Maximum example length, forwarded to `glue_convert_examples_to_features`.

        Returns:
            A `tf.data.Dataset` containing the task-specific features.

        Raises:
            ValueError: If `task` is not provided.
            KeyError: If `task` is not a key of `glue_processors`.
        """
        if task is None:
            raise ValueError("The task parameter is required to convert a tf.data.Dataset of GLUE examples.")

        processor = glue_processors[task]()
        examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
        # STS-B is the only task with float (regression) labels.
        label_type = tf.float32 if task == "sts-b" else tf.int64

        def gen():
            for ex in features:
                # Drop unset fields so the yielded dict only holds populated inputs.
                d = {k: v for k, v in asdict(ex).items() if v is not None}
                label = d.pop("label")
                yield (d, label)

        input_names = tokenizer.model_input_names

        return tf.data.Dataset.from_generator(
            gen,
            ({k: tf.int32 for k in input_names}, label_type),
            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
        )
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    """
    Tokenize a list of `InputExample` and pair each encoding with its converted label.

    Args:
        examples: Examples to convert.
        tokenizer: Tokenizer called on the `(text_a, text_b)` pairs with `padding="max_length"` and
            `truncation=True`.
        max_length: Maximum example length. Defaults to `tokenizer.model_max_length`.
        task: Optional GLUE task name. When given, missing `label_list` / `output_mode` values are taken from the
            task's processor and from `glue_output_modes`.
        label_list: Labels used to map classification labels to indices. Only needed when labelled examples are
            converted in `classification` mode.
        output_mode: Either `classification` or `regression`.

    Returns:
        A list of `InputFeatures`, one per example, in input order.

    Raises:
        ValueError: If a labelled example must be classified but no label list is available.
        KeyError: If a labelled example is found and `output_mode` is neither `classification` nor `regression`,
            or if `task` is not a known GLUE task.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info(f"Using label list {label_list} for task {task}")
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info(f"Using output mode {output_mode} for task {task}")

    # The label map is only needed for classification; building it unconditionally used to crash with a
    # TypeError when no label list was available, even for regression or unlabelled examples.
    label_map = None if label_list is None else {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            if label_map is None:
                raise ValueError("A label_list (or a task) is required to convert labels in classification mode.")
            return label_map[example.label]
        if output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Slice the batched encoding back into one feature per example.
    features = [
        InputFeatures(**{k: batch_encoding[k][i] for k in batch_encoding}, label=label)
        for i, label in enumerate(labels)
    ]

    # Log the first few conversions for debugging.
    for example, feature in zip(examples[:5], features):
        logger.info("*** Example ***")
        logger.info(f"guid: {example.guid}")
        logger.info(f"features: {feature}")

    return features
class OutputMode(Enum):
    """Kinds of model output used by the GLUE tasks; each member's value equals its name."""

    # Label is looked up in a label list and turned into an index.
    classification = "classification"
    # Label is converted to a float.
    regression = "regression"
class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence1 = tensor_dict["sentence1"].numpy().decode("utf-8")
        sentence2 = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence1, sentence2, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        logger.info(f"LOOKING AT {train_path}")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Columns 3 and 4 hold the two sentences and column 0
        the label, which is left as `None` for the test set.
        """
        is_test = set_type == "test"
        return [
            InputExample(
                guid=f"{set_type}-{row_idx}",
                text_a=row[3],
                text_b=row[4],
                label=None if is_test else row[0],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `premise`, `hypothesis` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev_matched.tsv` from `data_dir` and return the matched dev examples."""
        dev_path = os.path.join(data_dir, "dev_matched.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev_matched")

    def get_test_examples(self, data_dir):
        """Read `test_matched.tsv` from `data_dir` and return the (unlabelled) matched test examples."""
        test_path = os.path.join(data_dir, "test_matched.tsv")
        return self._create_examples(self._read_tsv(test_path), "test_matched")

    def get_labels(self):
        """Return the three NLI class labels."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix, columns 8 and 9 the
        two texts and the last column the label, which is left as `None` for any `test*` set type.
        """
        is_test = set_type.startswith("test")
        return [
            InputExample(
                guid=f"{set_type}-{row[0]}",
                text_a=row[8],
                text_b=row[9],
                label=None if is_test else row[-1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class MnliMismatchedProcessor(MnliProcessor):
    """
    Processor for the MultiNLI Mismatched data set (GLUE version).

    Only the dev/test file names differ from `MnliProcessor`. The constructor is inherited on purpose:
    `MnliProcessor.__init__` already emits the deprecation `FutureWarning`, and overriding it here just to warn
    again produced the same warning twice per instantiation.
    """

    def get_dev_examples(self, data_dir):
        """Read `dev_mismatched.tsv` from `data_dir` and return the mismatched dev examples."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")

    def get_test_examples(self, data_dir):
        """Read `test_mismatched.tsv` from `data_dir` and return the (unlabelled) mismatched test examples."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")
class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build a single-sentence `InputExample` from the `idx`, `sentence` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence, None, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into single-sentence `InputExample`s.

        Only the test set has its first row dropped; train/dev rows are all used. The sentence is read from
        column 1 (test) or column 3 (train/dev), and the label from column 1 (train/dev only).
        """
        is_test = set_type == "test"
        rows = lines[1:] if is_test else lines
        text_col = 1 if is_test else 3
        return [
            InputExample(
                guid=f"{set_type}-{row_idx}",
                text_a=row[text_col],
                text_b=None,
                label=None if is_test else row[1],
            )
            for row_idx, row in enumerate(rows)
        ]
class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build a single-sentence `InputExample` from the `idx`, `sentence` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence, None, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into single-sentence `InputExample`s.

        The first row is skipped (presumably the header). The sentence is read from column 1 (test) or column 0
        (train/dev), and the label from column 1 (train/dev only).
        """
        is_test = set_type == "test"
        text_col = 1 if is_test else 0
        return [
            InputExample(
                guid=f"{set_type}-{row_idx}",
                text_a=row[text_col],
                text_b=None,
                label=None if is_test else row[1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence1 = tensor_dict["sentence1"].numpy().decode("utf-8")
        sentence2 = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence1, sentence2, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return `[None]`: this task is listed as `regression` in `glue_output_modes`, so it has no classes."""
        return [None]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix, columns 7 and 8 the
        two sentences and the last column the label, which is left as `None` for the test set.
        """
        is_test = set_type == "test"
        return [
            InputExample(
                guid=f"{set_type}-{row[0]}",
                text_a=row[7],
                text_b=row[8],
                label=None if is_test else row[-1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `question1`, `question2` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        question1 = tensor_dict["question1"].numpy().decode("utf-8")
        question2 = tensor_dict["question2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, question1, question2, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix. The questions are
        read from columns 1 and 2 (test) or 3 and 4 (train/dev), and the label from column 5 (train/dev only).
        Rows that are too short to provide those columns are silently dropped.
        """
        is_test = set_type == "test"
        q1_col, q2_col = (1, 2) if is_test else (3, 4)

        examples = []
        for row_idx, row in enumerate(lines):
            if row_idx == 0:
                continue
            guid = f"{set_type}-{row[0]}"
            try:
                question1 = row[q1_col]
                question2 = row[q2_col]
                label = None if is_test else row[5]
            except IndexError:
                # Best-effort parsing: a truncated row is skipped rather than aborting the whole load.
                continue
            else:
                examples.append(InputExample(guid=guid, text_a=question1, text_b=question2, label=label))
        return examples
class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `question`, `sentence` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        question = tensor_dict["question"].numpy().decode("utf-8")
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, question, sentence, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two entailment class labels."""
        return ["entailment", "not_entailment"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix, columns 1 and 2 the
        two texts and the last column the label, which is left as `None` for the test set.
        """
        is_test = set_type == "test"
        return [
            InputExample(
                guid=f"{set_type}-{row[0]}",
                text_a=row[1],
                text_b=row[2],
                label=None if is_test else row[-1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence1 = tensor_dict["sentence1"].numpy().decode("utf-8")
        sentence2 = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence1, sentence2, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two entailment class labels."""
        return ["entailment", "not_entailment"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix, columns 1 and 2 the
        two texts and the last column the label, which is left as `None` for the test set.
        """
        is_test = set_type == "test"
        return [
            InputExample(
                guid=f"{set_type}-{row[0]}",
                text_a=row[1],
                text_b=row[2],
                label=None if is_test else row[-1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit the deprecation `FutureWarning`."""
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` tensors."""
        guid = tensor_dict["idx"].numpy()
        sentence1 = tensor_dict["sentence1"].numpy().decode("utf-8")
        sentence2 = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence1, sentence2, label)

    def get_train_examples(self, data_dir):
        """Read `train.tsv` from `data_dir` and return the training examples."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read `dev.tsv` from `data_dir` and return the dev examples."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Read `test.tsv` from `data_dir` and return the (unlabelled) test examples."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """
        Turn TSV rows into `InputExample`s.

        The first row is skipped (presumably the header). Column 0 provides the guid suffix, columns 1 and 2 the
        two sentences and the last column the label, which is left as `None` for the test set.
        """
        is_test = set_type == "test"
        return [
            InputExample(
                guid=f"{set_type}-{row[0]}",
                text_a=row[1],
                text_b=row[2],
                label=None if is_test else row[-1],
            )
            for row_idx, row in enumerate(lines)
            if row_idx != 0
        ]
# Number of labels per GLUE task (1 for the regression task STS-B).
# "mnli-mm" is listed so that every task key accepted by `glue_processors` and `glue_output_modes` can also be
# looked up here; it shares the three MNLI labels.
glue_tasks_num_labels = {
    "cola": 2,
    "mnli": 3,
    "mnli-mm": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}
# Registry mapping each GLUE task name to the processor class that builds its examples.
glue_processors = dict(
    [
        ("cola", ColaProcessor),
        ("mnli", MnliProcessor),
        ("mnli-mm", MnliMismatchedProcessor),
        ("mrpc", MrpcProcessor),
        ("sst-2", Sst2Processor),
        ("sts-b", StsbProcessor),
        ("qqp", QqpProcessor),
        ("qnli", QnliProcessor),
        ("rte", RteProcessor),
        ("wnli", WnliProcessor),
    ]
)
# Output mode per GLUE task: every task is a classification task except STS-B, which is a regression task.
glue_output_modes = {
    task_name: ("regression" if task_name == "sts-b" else "classification")
    for task_name in ("cola", "mnli", "mnli-mm", "mrpc", "sst-2", "sts-b", "qqp", "qnli", "rte", "wnli")
}
|