dynamic_module_utils.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684
  1. # coding=utf-8
  2. # Copyright 2021 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Utilities to dynamically load objects from the Hub."""
  16. import filecmp
  17. import hashlib
  18. import importlib
  19. import importlib.util
  20. import os
  21. import re
  22. import shutil
  23. import signal
  24. import sys
  25. import threading
  26. import typing
  27. import warnings
  28. from pathlib import Path
  29. from types import ModuleType
  30. from typing import Any, Dict, List, Optional, Union
  31. from huggingface_hub import try_to_load_from_cache
  32. from .utils import (
  33. HF_MODULES_CACHE,
  34. TRANSFORMERS_DYNAMIC_MODULE_NAME,
  35. cached_file,
  36. extract_commit_hash,
  37. is_offline_mode,
  38. logging,
  39. )
# Module-level logger, obtained from the project's `logging` helper imported from `.utils`.
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Lock held by `get_class_in_module` while it inspects `sys.modules` and (re)executes a dynamic module.
_HF_REMOTE_CODE_LOCK = threading.Lock()
  42. def init_hf_modules():
  43. """
  44. Creates the cache directory for modules with an init, and adds it to the Python path.
  45. """
  46. # This function has already been executed if HF_MODULES_CACHE already is in the Python path.
  47. if HF_MODULES_CACHE in sys.path:
  48. return
  49. sys.path.append(HF_MODULES_CACHE)
  50. os.makedirs(HF_MODULES_CACHE, exist_ok=True)
  51. init_path = Path(HF_MODULES_CACHE) / "__init__.py"
  52. if not init_path.exists():
  53. init_path.touch()
  54. importlib.invalidate_caches()
  55. def create_dynamic_module(name: Union[str, os.PathLike]) -> None:
  56. """
  57. Creates a dynamic module in the cache directory for modules.
  58. Args:
  59. name (`str` or `os.PathLike`):
  60. The name of the dynamic module to create.
  61. """
  62. init_hf_modules()
  63. dynamic_module_path = (Path(HF_MODULES_CACHE) / name).resolve()
  64. # If the parent module does not exist yet, recursively create it.
  65. if not dynamic_module_path.parent.exists():
  66. create_dynamic_module(dynamic_module_path.parent)
  67. os.makedirs(dynamic_module_path, exist_ok=True)
  68. init_path = dynamic_module_path / "__init__.py"
  69. if not init_path.exists():
  70. init_path.touch()
  71. # It is extremely important to invalidate the cache when we change stuff in those modules, or users end up
  72. # with errors about module that do not exist. Same for all other `invalidate_caches` in this file.
  73. importlib.invalidate_caches()
  74. def get_relative_imports(module_file: Union[str, os.PathLike]) -> List[str]:
  75. """
  76. Get the list of modules that are relatively imported in a module file.
  77. Args:
  78. module_file (`str` or `os.PathLike`): The module file to inspect.
  79. Returns:
  80. `List[str]`: The list of relative imports in the module.
  81. """
  82. with open(module_file, "r", encoding="utf-8") as f:
  83. content = f.read()
  84. # Imports of the form `import .xxx`
  85. relative_imports = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
  86. # Imports of the form `from .xxx import yyy`
  87. relative_imports += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
  88. # Unique-ify
  89. return list(set(relative_imports))
  90. def get_relative_import_files(module_file: Union[str, os.PathLike]) -> List[str]:
  91. """
  92. Get the list of all files that are needed for a given module. Note that this function recurses through the relative
  93. imports (if a imports b and b imports c, it will return module files for b and c).
  94. Args:
  95. module_file (`str` or `os.PathLike`): The module file to inspect.
  96. Returns:
  97. `List[str]`: The list of all relative imports a given module needs (recursively), which will give us the list
  98. of module files a given module needs.
  99. """
  100. no_change = False
  101. files_to_check = [module_file]
  102. all_relative_imports = []
  103. # Let's recurse through all relative imports
  104. while not no_change:
  105. new_imports = []
  106. for f in files_to_check:
  107. new_imports.extend(get_relative_imports(f))
  108. module_path = Path(module_file).parent
  109. new_import_files = [str(module_path / m) for m in new_imports]
  110. new_import_files = [f for f in new_import_files if f not in all_relative_imports]
  111. files_to_check = [f"{f}.py" for f in new_import_files]
  112. no_change = len(new_import_files) == 0
  113. all_relative_imports.extend(files_to_check)
  114. return all_relative_imports
  115. def get_imports(filename: Union[str, os.PathLike]) -> List[str]:
  116. """
  117. Extracts all the libraries (not relative imports this time) that are imported in a file.
  118. Args:
  119. filename (`str` or `os.PathLike`): The module file to inspect.
  120. Returns:
  121. `List[str]`: The list of all packages required to use the input module.
  122. """
  123. with open(filename, "r", encoding="utf-8") as f:
  124. content = f.read()
  125. # filter out try/except block so in custom code we can have try/except imports
  126. content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL)
  127. # filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment
  128. content = re.sub(
  129. r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", content, flags=re.MULTILINE
  130. )
  131. # Imports of the form `import xxx`
  132. imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
  133. # Imports of the form `from xxx import yyy`
  134. imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
  135. # Only keep the top-level module
  136. imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
  137. return list(set(imports))
  138. def check_imports(filename: Union[str, os.PathLike]) -> List[str]:
  139. """
  140. Check if the current Python environment contains all the libraries that are imported in a file. Will raise if a
  141. library is missing.
  142. Args:
  143. filename (`str` or `os.PathLike`): The module file to check.
  144. Returns:
  145. `List[str]`: The list of relative imports in the file.
  146. """
  147. imports = get_imports(filename)
  148. missing_packages = []
  149. for imp in imports:
  150. try:
  151. importlib.import_module(imp)
  152. except ImportError as exception:
  153. logger.warning(f"Encountered exception while importing {imp}: {exception}")
  154. # Some packages can fail with an ImportError because of a dependency issue.
  155. # This check avoids hiding such errors.
  156. # See https://github.com/huggingface/transformers/issues/33604
  157. if "No module named" in str(exception):
  158. missing_packages.append(imp)
  159. else:
  160. raise
  161. if len(missing_packages) > 0:
  162. raise ImportError(
  163. "This modeling file requires the following packages that were not found in your environment: "
  164. f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
  165. )
  166. return get_relative_imports(filename)
  167. def get_class_in_module(
  168. class_name: str,
  169. module_path: Union[str, os.PathLike],
  170. *,
  171. force_reload: bool = False,
  172. ) -> typing.Type:
  173. """
  174. Import a module on the cache directory for modules and extract a class from it.
  175. Args:
  176. class_name (`str`): The name of the class to import.
  177. module_path (`str` or `os.PathLike`): The path to the module to import.
  178. force_reload (`bool`, *optional*, defaults to `False`):
  179. Whether to reload the dynamic module from file if it already exists in `sys.modules`.
  180. Otherwise, the module is only reloaded if the file has changed.
  181. Returns:
  182. `typing.Type`: The class looked for.
  183. """
  184. name = os.path.normpath(module_path)
  185. if name.endswith(".py"):
  186. name = name[:-3]
  187. name = name.replace(os.path.sep, ".")
  188. module_file: Path = Path(HF_MODULES_CACHE) / module_path
  189. with _HF_REMOTE_CODE_LOCK:
  190. if force_reload:
  191. sys.modules.pop(name, None)
  192. importlib.invalidate_caches()
  193. cached_module: Optional[ModuleType] = sys.modules.get(name)
  194. module_spec = importlib.util.spec_from_file_location(name, location=module_file)
  195. # Hash the module file and all its relative imports to check if we need to reload it
  196. module_files: List[Path] = [module_file] + sorted(map(Path, get_relative_import_files(module_file)))
  197. module_hash: str = hashlib.sha256(b"".join(bytes(f) + f.read_bytes() for f in module_files)).hexdigest()
  198. module: ModuleType
  199. if cached_module is None:
  200. module = importlib.util.module_from_spec(module_spec)
  201. # insert it into sys.modules before any loading begins
  202. sys.modules[name] = module
  203. else:
  204. module = cached_module
  205. # reload in both cases, unless the module is already imported and the hash hits
  206. if getattr(module, "__transformers_module_hash__", "") != module_hash:
  207. module_spec.loader.exec_module(module)
  208. module.__transformers_module_hash__ = module_hash
  209. return getattr(module, class_name)
  210. def get_cached_module_file(
  211. pretrained_model_name_or_path: Union[str, os.PathLike],
  212. module_file: str,
  213. cache_dir: Optional[Union[str, os.PathLike]] = None,
  214. force_download: bool = False,
  215. resume_download: Optional[bool] = None,
  216. proxies: Optional[Dict[str, str]] = None,
  217. token: Optional[Union[bool, str]] = None,
  218. revision: Optional[str] = None,
  219. local_files_only: bool = False,
  220. repo_type: Optional[str] = None,
  221. _commit_hash: Optional[str] = None,
  222. **deprecated_kwargs,
  223. ) -> str:
  224. """
  225. Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
  226. Transformers module.
  227. Args:
  228. pretrained_model_name_or_path (`str` or `os.PathLike`):
  229. This can be either:
  230. - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
  231. huggingface.co.
  232. - a path to a *directory* containing a configuration file saved using the
  233. [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
  234. module_file (`str`):
  235. The name of the module file containing the class to look for.
  236. cache_dir (`str` or `os.PathLike`, *optional*):
  237. Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
  238. cache should not be used.
  239. force_download (`bool`, *optional*, defaults to `False`):
  240. Whether or not to force to (re-)download the configuration files and override the cached versions if they
  241. exist.
  242. resume_download:
  243. Deprecated and ignored. All downloads are now resumed by default when possible.
  244. Will be removed in v5 of Transformers.
  245. proxies (`Dict[str, str]`, *optional*):
  246. A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
  247. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
  248. token (`str` or *bool*, *optional*):
  249. The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
  250. when running `huggingface-cli login` (stored in `~/.huggingface`).
  251. revision (`str`, *optional*, defaults to `"main"`):
  252. The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
  253. git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
  254. identifier allowed by git.
  255. local_files_only (`bool`, *optional*, defaults to `False`):
  256. If `True`, will only try to load the tokenizer configuration from local files.
  257. repo_type (`str`, *optional*):
  258. Specify the repo type (useful when downloading from a space for instance).
  259. <Tip>
  260. Passing `token=True` is required when you want to use a private model.
  261. </Tip>
  262. Returns:
  263. `str`: The path to the module inside the cache.
  264. """
  265. use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
  266. if use_auth_token is not None:
  267. warnings.warn(
  268. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  269. FutureWarning,
  270. )
  271. if token is not None:
  272. raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
  273. token = use_auth_token
  274. if is_offline_mode() and not local_files_only:
  275. logger.info("Offline mode: forcing local_files_only=True")
  276. local_files_only = True
  277. # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
  278. pretrained_model_name_or_path = str(pretrained_model_name_or_path)
  279. is_local = os.path.isdir(pretrained_model_name_or_path)
  280. if is_local:
  281. submodule = os.path.basename(pretrained_model_name_or_path)
  282. else:
  283. submodule = pretrained_model_name_or_path.replace("/", os.path.sep)
  284. cached_module = try_to_load_from_cache(
  285. pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type
  286. )
  287. new_files = []
  288. try:
  289. # Load from URL or cache if already cached
  290. resolved_module_file = cached_file(
  291. pretrained_model_name_or_path,
  292. module_file,
  293. cache_dir=cache_dir,
  294. force_download=force_download,
  295. proxies=proxies,
  296. resume_download=resume_download,
  297. local_files_only=local_files_only,
  298. token=token,
  299. revision=revision,
  300. repo_type=repo_type,
  301. _commit_hash=_commit_hash,
  302. )
  303. if not is_local and cached_module != resolved_module_file:
  304. new_files.append(module_file)
  305. except EnvironmentError:
  306. logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
  307. raise
  308. # Check we have all the requirements in our environment
  309. modules_needed = check_imports(resolved_module_file)
  310. # Now we move the module inside our cached dynamic modules.
  311. full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
  312. create_dynamic_module(full_submodule)
  313. submodule_path = Path(HF_MODULES_CACHE) / full_submodule
  314. if submodule == os.path.basename(pretrained_model_name_or_path):
  315. # We copy local files to avoid putting too many folders in sys.path. This copy is done when the file is new or
  316. # has changed since last copy.
  317. if not (submodule_path / module_file).exists() or not filecmp.cmp(
  318. resolved_module_file, str(submodule_path / module_file)
  319. ):
  320. shutil.copy(resolved_module_file, submodule_path / module_file)
  321. importlib.invalidate_caches()
  322. for module_needed in modules_needed:
  323. module_needed = f"{module_needed}.py"
  324. module_needed_file = os.path.join(pretrained_model_name_or_path, module_needed)
  325. if not (submodule_path / module_needed).exists() or not filecmp.cmp(
  326. module_needed_file, str(submodule_path / module_needed)
  327. ):
  328. shutil.copy(module_needed_file, submodule_path / module_needed)
  329. importlib.invalidate_caches()
  330. else:
  331. # Get the commit hash
  332. commit_hash = extract_commit_hash(resolved_module_file, _commit_hash)
  333. # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the
  334. # benefit of versioning.
  335. submodule_path = submodule_path / commit_hash
  336. full_submodule = full_submodule + os.path.sep + commit_hash
  337. create_dynamic_module(full_submodule)
  338. if not (submodule_path / module_file).exists():
  339. shutil.copy(resolved_module_file, submodule_path / module_file)
  340. importlib.invalidate_caches()
  341. # Make sure we also have every file with relative
  342. for module_needed in modules_needed:
  343. if not (submodule_path / f"{module_needed}.py").exists():
  344. get_cached_module_file(
  345. pretrained_model_name_or_path,
  346. f"{module_needed}.py",
  347. cache_dir=cache_dir,
  348. force_download=force_download,
  349. resume_download=resume_download,
  350. proxies=proxies,
  351. token=token,
  352. revision=revision,
  353. local_files_only=local_files_only,
  354. _commit_hash=commit_hash,
  355. )
  356. new_files.append(f"{module_needed}.py")
  357. if len(new_files) > 0 and revision is None:
  358. new_files = "\n".join([f"- {f}" for f in new_files])
  359. repo_type_str = "" if repo_type is None else f"{repo_type}s/"
  360. url = f"https://huggingface.co/{repo_type_str}{pretrained_model_name_or_path}"
  361. logger.warning(
  362. f"A new version of the following files was downloaded from {url}:\n{new_files}"
  363. "\n. Make sure to double-check they do not contain any added malicious code. To avoid downloading new "
  364. "versions of the code file, you can pin a revision."
  365. )
  366. return os.path.join(full_submodule, module_file)
  367. def get_class_from_dynamic_module(
  368. class_reference: str,
  369. pretrained_model_name_or_path: Union[str, os.PathLike],
  370. cache_dir: Optional[Union[str, os.PathLike]] = None,
  371. force_download: bool = False,
  372. resume_download: Optional[bool] = None,
  373. proxies: Optional[Dict[str, str]] = None,
  374. token: Optional[Union[bool, str]] = None,
  375. revision: Optional[str] = None,
  376. local_files_only: bool = False,
  377. repo_type: Optional[str] = None,
  378. code_revision: Optional[str] = None,
  379. **kwargs,
  380. ) -> typing.Type:
  381. """
  382. Extracts a class from a module file, present in the local folder or repository of a model.
  383. <Tip warning={true}>
  384. Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
  385. therefore only be called on trusted repos.
  386. </Tip>
  387. Args:
  388. class_reference (`str`):
  389. The full name of the class to load, including its module and optionally its repo.
  390. pretrained_model_name_or_path (`str` or `os.PathLike`):
  391. This can be either:
  392. - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
  393. huggingface.co.
  394. - a path to a *directory* containing a configuration file saved using the
  395. [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
  396. This is used when `class_reference` does not specify another repo.
  397. module_file (`str`):
  398. The name of the module file containing the class to look for.
  399. class_name (`str`):
  400. The name of the class to import in the module.
  401. cache_dir (`str` or `os.PathLike`, *optional*):
  402. Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
  403. cache should not be used.
  404. force_download (`bool`, *optional*, defaults to `False`):
  405. Whether or not to force to (re-)download the configuration files and override the cached versions if they
  406. exist.
  407. resume_download:
  408. Deprecated and ignored. All downloads are now resumed by default when possible.
  409. Will be removed in v5 of Transformers.
  410. proxies (`Dict[str, str]`, *optional*):
  411. A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
  412. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
  413. token (`str` or `bool`, *optional*):
  414. The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
  415. when running `huggingface-cli login` (stored in `~/.huggingface`).
  416. revision (`str`, *optional*, defaults to `"main"`):
  417. The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
  418. git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
  419. identifier allowed by git.
  420. local_files_only (`bool`, *optional*, defaults to `False`):
  421. If `True`, will only try to load the tokenizer configuration from local files.
  422. repo_type (`str`, *optional*):
  423. Specify the repo type (useful when downloading from a space for instance).
  424. code_revision (`str`, *optional*, defaults to `"main"`):
  425. The specific revision to use for the code on the Hub, if the code leaves in a different repository than the
  426. rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based system for
  427. storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
  428. <Tip>
  429. Passing `token=True` is required when you want to use a private model.
  430. </Tip>
  431. Returns:
  432. `typing.Type`: The class, dynamically imported from the module.
  433. Examples:
  434. ```python
  435. # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
  436. # module.
  437. cls = get_class_from_dynamic_module("modeling.MyBertModel", "sgugger/my-bert-model")
  438. # Download module `modeling.py` from a given repo and cache then extract the class `MyBertModel` from this
  439. # module.
  440. cls = get_class_from_dynamic_module("sgugger/my-bert-model--modeling.MyBertModel", "sgugger/another-bert-model")
  441. ```"""
  442. use_auth_token = kwargs.pop("use_auth_token", None)
  443. if use_auth_token is not None:
  444. warnings.warn(
  445. "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
  446. FutureWarning,
  447. )
  448. if token is not None:
  449. raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
  450. token = use_auth_token
  451. # Catch the name of the repo if it's specified in `class_reference`
  452. if "--" in class_reference:
  453. repo_id, class_reference = class_reference.split("--")
  454. else:
  455. repo_id = pretrained_model_name_or_path
  456. module_file, class_name = class_reference.split(".")
  457. if code_revision is None and pretrained_model_name_or_path == repo_id:
  458. code_revision = revision
  459. # And lastly we get the class inside our newly created module
  460. final_module = get_cached_module_file(
  461. repo_id,
  462. module_file + ".py",
  463. cache_dir=cache_dir,
  464. force_download=force_download,
  465. resume_download=resume_download,
  466. proxies=proxies,
  467. token=token,
  468. revision=code_revision,
  469. local_files_only=local_files_only,
  470. repo_type=repo_type,
  471. )
  472. return get_class_in_module(class_name, final_module, force_reload=force_download)
  473. def custom_object_save(obj: Any, folder: Union[str, os.PathLike], config: Optional[Dict] = None) -> List[str]:
  474. """
  475. Save the modeling files corresponding to a custom model/configuration/tokenizer etc. in a given folder. Optionally
  476. adds the proper fields in a config.
  477. Args:
  478. obj (`Any`): The object for which to save the module files.
  479. folder (`str` or `os.PathLike`): The folder where to save.
  480. config (`PretrainedConfig` or dictionary, `optional`):
  481. A config in which to register the auto_map corresponding to this custom object.
  482. Returns:
  483. `List[str]`: The list of files saved.
  484. """
  485. if obj.__module__ == "__main__":
  486. logger.warning(
  487. f"We can't save the code defining {obj} in {folder} as it's been defined in __main__. You should put "
  488. "this code in a separate module so we can include it in the saved folder and make it easier to share via "
  489. "the Hub."
  490. )
  491. return
  492. def _set_auto_map_in_config(_config):
  493. module_name = obj.__class__.__module__
  494. last_module = module_name.split(".")[-1]
  495. full_name = f"{last_module}.{obj.__class__.__name__}"
  496. # Special handling for tokenizers
  497. if "Tokenizer" in full_name:
  498. slow_tokenizer_class = None
  499. fast_tokenizer_class = None
  500. if obj.__class__.__name__.endswith("Fast"):
  501. # Fast tokenizer: we have the fast tokenizer class and we may have the slow one has an attribute.
  502. fast_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"
  503. if getattr(obj, "slow_tokenizer_class", None) is not None:
  504. slow_tokenizer = getattr(obj, "slow_tokenizer_class")
  505. slow_tok_module_name = slow_tokenizer.__module__
  506. last_slow_tok_module = slow_tok_module_name.split(".")[-1]
  507. slow_tokenizer_class = f"{last_slow_tok_module}.{slow_tokenizer.__name__}"
  508. else:
  509. # Slow tokenizer: no way to have the fast class
  510. slow_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"
  511. full_name = (slow_tokenizer_class, fast_tokenizer_class)
  512. if isinstance(_config, dict):
  513. auto_map = _config.get("auto_map", {})
  514. auto_map[obj._auto_class] = full_name
  515. _config["auto_map"] = auto_map
  516. elif getattr(_config, "auto_map", None) is not None:
  517. _config.auto_map[obj._auto_class] = full_name
  518. else:
  519. _config.auto_map = {obj._auto_class: full_name}
  520. # Add object class to the config auto_map
  521. if isinstance(config, (list, tuple)):
  522. for cfg in config:
  523. _set_auto_map_in_config(cfg)
  524. elif config is not None:
  525. _set_auto_map_in_config(config)
  526. result = []
  527. # Copy module file to the output folder.
  528. object_file = sys.modules[obj.__module__].__file__
  529. dest_file = Path(folder) / (Path(object_file).name)
  530. shutil.copy(object_file, dest_file)
  531. result.append(dest_file)
  532. # Gather all relative imports recursively and make sure they are copied as well.
  533. for needed_file in get_relative_import_files(object_file):
  534. dest_file = Path(folder) / (Path(needed_file).name)
  535. shutil.copy(needed_file, dest_file)
  536. result.append(dest_file)
  537. return result
  538. def _raise_timeout_error(signum, frame):
  539. raise ValueError(
  540. "Loading this model requires you to execute custom code contained in the model repository on your local "
  541. "machine. Please set the option `trust_remote_code=True` to permit loading of this model."
  542. )
# Delay passed to `signal.alarm` while `resolve_trust_remote_code` waits for the user's answer. When it is not
# positive (the CI sets it to 0), no prompt is shown and remote code is refused immediately.
TIME_OUT_REMOTE_CODE = 15
  544. def resolve_trust_remote_code(trust_remote_code, model_name, has_local_code, has_remote_code):
  545. if trust_remote_code is None:
  546. if has_local_code:
  547. trust_remote_code = False
  548. elif has_remote_code and TIME_OUT_REMOTE_CODE > 0:
  549. prev_sig_handler = None
  550. try:
  551. prev_sig_handler = signal.signal(signal.SIGALRM, _raise_timeout_error)
  552. signal.alarm(TIME_OUT_REMOTE_CODE)
  553. while trust_remote_code is None:
  554. answer = input(
  555. f"The repository for {model_name} contains custom code which must be executed to correctly "
  556. f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
  557. f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
  558. f"Do you wish to run the custom code? [y/N] "
  559. )
  560. if answer.lower() in ["yes", "y", "1"]:
  561. trust_remote_code = True
  562. elif answer.lower() in ["no", "n", "0", ""]:
  563. trust_remote_code = False
  564. signal.alarm(0)
  565. except Exception:
  566. # OS which does not support signal.SIGALRM
  567. raise ValueError(
  568. f"The repository for {model_name} contains custom code which must be executed to correctly "
  569. f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
  570. f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
  571. )
  572. finally:
  573. if prev_sig_handler is not None:
  574. signal.signal(signal.SIGALRM, prev_sig_handler)
  575. signal.alarm(0)
  576. elif has_remote_code:
  577. # For the CI which puts the timeout at 0
  578. _raise_timeout_error(None, None)
  579. if has_remote_code and not has_local_code and not trust_remote_code:
  580. raise ValueError(
  581. f"Loading {model_name} requires you to execute the configuration file in that"
  582. " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
  583. " set the option `trust_remote_code=True` to remove this error."
  584. )
  585. return trust_remote_code