- # coding=utf-8
- # Copyright 2022 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from typing import Dict, Iterable, Optional, Union
- import numpy as np
- from .image_processing_base import BatchFeature, ImageProcessingMixin
- from .image_transforms import center_crop, normalize, rescale
- from .image_utils import ChannelDimension
- from .utils import logging
- logger = logging.get_logger(__name__)
- INIT_SERVICE_KWARGS = [
- "processor_class",
- "image_processor_type",
- ]
- class BaseImageProcessor(ImageProcessingMixin):
- def __init__(self, **kwargs):
- super().__init__(**kwargs)
- def __call__(self, images, **kwargs) -> BatchFeature:
- """Preprocess an image or a batch of images."""
- return self.preprocess(images, **kwargs)
- def preprocess(self, images, **kwargs) -> BatchFeature:
- raise NotImplementedError("Each image processor must implement its own preprocess method")
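- # Subclasses are expected to override `preprocess`. A minimal, hypothetical sketch of such an
- # override (the class name, defaults, and output keys below are illustrative, not part of this module):
- #
- # >>> class MyImageProcessor(BaseImageProcessor):
- # ...     def preprocess(self, images, do_rescale=True, rescale_factor=1 / 255, return_tensors=None, **kwargs):
- # ...         images = images if isinstance(images, (list, tuple)) else [images]
- # ...         images = [np.asarray(image) for image in images]
- # ...         if do_rescale:
- # ...             images = [self.rescale(image, scale=rescale_factor) for image in images]
- # ...         return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)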
- def rescale(
- self,
- image: np.ndarray,
- scale: float,
- data_format: Optional[Union[str, ChannelDimension]] = None,
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
- **kwargs,
- ) -> np.ndarray:
- """
- Rescale an image by a scale factor. image = image * scale.
- Args:
- image (`np.ndarray`):
- Image to rescale.
- scale (`float`):
- The scaling factor to rescale pixel values by.
- data_format (`str` or `ChannelDimension`, *optional*):
- The channel dimension format for the output image. If unset, the channel dimension format of the input
- image is used. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- input_data_format (`ChannelDimension` or `str`, *optional*):
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
- from the input image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Returns:
- `np.ndarray`: The rescaled image.
- """
- return rescale(image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs)
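- # A minimal usage sketch for `rescale` (illustrative values; a common choice of `scale` is
- # 1 / 255 to map uint8 pixel values into [0, 1]):
- #
- # >>> image = np.array([[[0, 128, 255]]], dtype=np.uint8)  # shape (1, 1, 3), channels last
- # >>> BaseImageProcessor().rescale(image, scale=1 / 255)   # values become 0.0, ~0.502, 1.0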
- def normalize(
- self,
- image: np.ndarray,
- mean: Union[float, Iterable[float]],
- std: Union[float, Iterable[float]],
- data_format: Optional[Union[str, ChannelDimension]] = None,
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
- **kwargs,
- ) -> np.ndarray:
- """
- Normalize an image. image = (image - mean) / std.
- Args:
- image (`np.ndarray`):
- Image to normalize.
- mean (`float` or `Iterable[float]`):
- Image mean to use for normalization.
- std (`float` or `Iterable[float]`):
- Image standard deviation to use for normalization.
- data_format (`str` or `ChannelDimension`, *optional*):
- The channel dimension format for the output image. If unset, the channel dimension format of the input
- image is used. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- input_data_format (`ChannelDimension` or `str`, *optional*):
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
- from the input image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Returns:
- `np.ndarray`: The normalized image.
- """
- return normalize(
- image, mean=mean, std=std, data_format=data_format, input_data_format=input_data_format, **kwargs
- )
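- # A minimal usage sketch for `normalize`, assuming an already-rescaled channels-last image and
- # the commonly used ImageNet statistics (values here are illustrative):
- #
- # >>> image = np.random.rand(224, 224, 3).astype(np.float32)
- # >>> BaseImageProcessor().normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])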
- def center_crop(
- self,
- image: np.ndarray,
- size: Dict[str, int],
- data_format: Optional[Union[str, ChannelDimension]] = None,
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
- **kwargs,
- ) -> np.ndarray:
- """
- Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along
- any edge, the image is padded with zeros and then center cropped.
- Args:
- image (`np.ndarray`):
- Image to center crop.
- size (`Dict[str, int]`):
- Size of the output image in the form `{"height": int, "width": int}`.
- data_format (`str` or `ChannelDimension`, *optional*):
- The channel dimension format for the output image. If unset, the channel dimension format of the input
- image is used. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- input_data_format (`ChannelDimension` or `str`, *optional*):
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
- from the input image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- """
- size = get_size_dict(size)
- if "height" not in size or "width" not in size:
- raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
- return center_crop(
- image,
- size=(size["height"], size["width"]),
- data_format=data_format,
- input_data_format=input_data_format,
- **kwargs,
- )
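- # A minimal usage sketch for `center_crop` (illustrative values). The channels-last input is
- # cropped down to 224x224; inputs smaller than the target are zero-padded first:
- #
- # >>> image = np.zeros((256, 256, 3), dtype=np.float32)
- # >>> cropped = BaseImageProcessor().center_crop(image, size={"height": 224, "width": 224})
- # >>> cropped.shape
- # (224, 224, 3)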
- def to_dict(self):
- encoder_dict = super().to_dict()
- encoder_dict.pop("_valid_processor_keys", None)
- return encoder_dict
- VALID_SIZE_DICT_KEYS = (
- {"height", "width"},
- {"shortest_edge"},
- {"shortest_edge", "longest_edge"},
- {"longest_edge"},
- {"max_height", "max_width"},
- )
- def is_valid_size_dict(size_dict):
- if not isinstance(size_dict, dict):
- return False
- size_dict_keys = set(size_dict.keys())
- for allowed_keys in VALID_SIZE_DICT_KEYS:
- if size_dict_keys == allowed_keys:
- return True
- return False
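- # For illustration, the validation above accepts exactly the key sets listed in
- # VALID_SIZE_DICT_KEYS and nothing else:
- #
- # >>> is_valid_size_dict({"height": 224, "width": 224})
- # True
- # >>> is_valid_size_dict({"shortest_edge": 224, "longest_edge": 1333})
- # True
- # >>> is_valid_size_dict({"height": 224})
- # False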
- def convert_to_size_dict(
- size, max_size: Optional[int] = None, default_to_square: bool = True, height_width_order: bool = True
- ):
- # By default, if size is an int we assume it represents a tuple of (size, size).
- if isinstance(size, int) and default_to_square:
- if max_size is not None:
- raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size")
- return {"height": size, "width": size}
- # In other configs, if size is an int and default_to_square is False, size represents the length of
- # the shortest edge after resizing.
- elif isinstance(size, int) and not default_to_square:
- size_dict = {"shortest_edge": size}
- if max_size is not None:
- size_dict["longest_edge"] = max_size
- return size_dict
- # Otherwise, if size is a tuple it's either (height, width) or (width, height)
- elif isinstance(size, (tuple, list)) and height_width_order:
- return {"height": size[0], "width": size[1]}
- elif isinstance(size, (tuple, list)) and not height_width_order:
- return {"height": size[1], "width": size[0]}
- elif size is None and max_size is not None:
- if default_to_square:
- raise ValueError("Cannot specify both default_to_square=True and max_size")
- return {"longest_edge": max_size}
- raise ValueError(f"Could not convert size input to size dict: {size}")
- def get_size_dict(
- size: Optional[Union[int, Iterable[int], Dict[str, int]]] = None,
- max_size: Optional[int] = None,
- height_width_order: bool = True,
- default_to_square: bool = True,
- param_name: str = "size",
- ) -> dict:
- """
- Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards
- compatibility with the old image processor configs and to remove ambiguity over whether the tuple is in (height,
- width) or (width, height) format.
- - If `size` is a tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width":
- size[0]}` if `height_width_order` is `False`.
- - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`.
- - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size`
- is set, it is added to the dict as `{"longest_edge": max_size}`.
- Args:
- size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*):
- The `size` parameter to be cast into a size dictionary.
- max_size (`Optional[int]`, *optional*):
- The `max_size` parameter to be added to the size dictionary as `"longest_edge"`.
- height_width_order (`bool`, *optional*, defaults to `True`):
- If `size` is a tuple, whether it's in (height, width) or (width, height) order.
- default_to_square (`bool`, *optional*, defaults to `True`):
- If `size` is an int, whether to default to a square image or not.
- param_name (`str`, *optional*, defaults to `"size"`):
- The name of the parameter being converted, used in the logged and raised messages.
- """
- if not isinstance(size, dict):
- size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order)
- logger.info(
- f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}."
- f" Converted to {size_dict}.",
- )
- else:
- size_dict = size
- if not is_valid_size_dict(size_dict):
- raise ValueError(
- f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}"
- )
- return size_dict
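- # A usage sketch for the backwards-compatibility conversion (illustrative values): dictionaries
- # with a valid key set pass through unchanged, while legacy int/tuple values are converted and
- # an informational message is logged:
- #
- # >>> get_size_dict({"shortest_edge": 224})
- # {'shortest_edge': 224}
- # >>> get_size_dict(224, default_to_square=False)
- # {'shortest_edge': 224}
- # >>> get_size_dict((512, 384), height_width_order=False)
- # {'height': 384, 'width': 512}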
- def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
- """
- Selects the best resolution from a list of possible resolutions based on the original size.
- This is done by calculating the effective and wasted resolution for each possible resolution.
- The best fit is the resolution that maximizes the effective resolution; ties are broken by picking the one with the least wasted resolution.
- Args:
- original_size (tuple):
- The original size of the image in the format (height, width).
- possible_resolutions (list):
- A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
- Returns:
- tuple: The best fit resolution in the format (height, width).
- """
- original_height, original_width = original_size
- best_fit = None
- max_effective_resolution = 0
- min_wasted_resolution = float("inf")
- for height, width in possible_resolutions:
- scale = min(width / original_width, height / original_height)
- downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
- effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
- wasted_resolution = (width * height) - effective_resolution
- if effective_resolution > max_effective_resolution or (
- effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
- ):
- max_effective_resolution = effective_resolution
- min_wasted_resolution = wasted_resolution
- best_fit = (height, width)
- return best_fit
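- # A worked example (values chosen for illustration): for a 480x640 (height x width) image, the
- # 672x672 grid keeps the full original resolution after downscaling and is selected even though
- # it wastes more area than the 336x672 option, because effective resolution is compared first:
- #
- # >>> select_best_resolution((480, 640), [(336, 672), (672, 336), (672, 672)])
- # (672, 672)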