sources.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. from __future__ import annotations
  2. import logging
  3. import mimetypes
  4. import os
  5. from collections import defaultdict
  6. from collections.abc import Iterable
  7. from typing import Callable
  8. from pip._vendor.packaging.utils import (
  9. InvalidSdistFilename,
  10. InvalidWheelFilename,
  11. canonicalize_name,
  12. parse_sdist_filename,
  13. parse_wheel_filename,
  14. )
  15. from pip._internal.models.candidate import InstallationCandidate
  16. from pip._internal.models.link import Link
  17. from pip._internal.utils.urls import path_to_url, url_to_path
  18. from pip._internal.vcs import is_url
  # Module-level logger, named after this module.
  logger = logging.getLogger(__name__)

  # Return types of the two lookup methods every LinkSource provides.
  FoundCandidates = Iterable[InstallationCandidate]
  FoundLinks = Iterable[Link]

  # Callback that receives a page link and yields the candidates found for it.
  CandidatesFromPage = Callable[[Link], FoundCandidates]
  # Predicate consulted before a remote link is parsed as a page.
  PageValidator = Callable[[Link], bool]
  class LinkSource:
      """Interface shared by the link sources defined in this module.

      The members below only raise ``NotImplementedError``; each concrete
      source overrides them.
      """

      @property
      def link(self) -> Link | None:
          """The link backing this source, or ``None`` when there is none."""
          raise NotImplementedError

      def page_candidates(self) -> FoundCandidates:
          """Return candidates found by parsing an archive listing HTML file."""
          raise NotImplementedError

      def file_links(self) -> FoundLinks:
          """Return links found by specifying archives directly."""
          raise NotImplementedError
  35. def _is_html_file(file_url: str) -> bool:
  36. return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
  37. class _FlatDirectoryToUrls:
  38. """Scans directory and caches results"""
  39. def __init__(self, path: str) -> None:
  40. self._path = path
  41. self._page_candidates: list[str] = []
  42. self._project_name_to_urls: dict[str, list[str]] = defaultdict(list)
  43. self._scanned_directory = False
  44. def _scan_directory(self) -> None:
  45. """Scans directory once and populates both page_candidates
  46. and project_name_to_urls at the same time
  47. """
  48. for entry in os.scandir(self._path):
  49. url = path_to_url(entry.path)
  50. if _is_html_file(url):
  51. self._page_candidates.append(url)
  52. continue
  53. # File must have a valid wheel or sdist name,
  54. # otherwise not worth considering as a package
  55. try:
  56. project_filename = parse_wheel_filename(entry.name)[0]
  57. except InvalidWheelFilename:
  58. try:
  59. project_filename = parse_sdist_filename(entry.name)[0]
  60. except InvalidSdistFilename:
  61. continue
  62. self._project_name_to_urls[project_filename].append(url)
  63. self._scanned_directory = True
  64. @property
  65. def page_candidates(self) -> list[str]:
  66. if not self._scanned_directory:
  67. self._scan_directory()
  68. return self._page_candidates
  69. @property
  70. def project_name_to_urls(self) -> dict[str, list[str]]:
  71. if not self._scanned_directory:
  72. self._scan_directory()
  73. return self._project_name_to_urls
  class _FlatDirectorySource(LinkSource):
      """Link source specified by ``--find-links=<path-to-dir>``.

      This looks at the content of the directory, and returns:

      * ``page_candidates``: Candidates listed on each HTML file in the
        directory.
      * ``file_links``: Links to the archives in the directory that belong
        to the requested project.
      """

      # Shared by all instances: one scanner per directory path, so a
      # directory already seen by another instance is not scanned again.
      _paths_to_urls: dict[str, _FlatDirectoryToUrls] = {}

      def __init__(
          self,
          candidates_from_page: CandidatesFromPage,
          path: str,
          project_name: str,
      ) -> None:
          """Bind the source to *path* and to the canonical project name."""
          self._candidates_from_page = candidates_from_page
          self._project_name = canonicalize_name(project_name)

          # Reuse the scanner registered for this path, creating and
          # registering one the first time the path is seen.
          try:
              scanner = self._paths_to_urls[path]
          except KeyError:
              scanner = self._paths_to_urls[path] = _FlatDirectoryToUrls(path=path)
          self._path_to_urls = scanner

      @property
      def link(self) -> Link | None:
          """A flat directory has no single underlying link."""
          return None

      def page_candidates(self) -> FoundCandidates:
          """Yield the candidates of every HTML page found in the directory."""
          for page_url in self._path_to_urls.page_candidates:
              yield from self._candidates_from_page(Link(page_url))

      def file_links(self) -> FoundLinks:
          """Yield a link for each archive recorded under this project name."""
          archive_urls = self._path_to_urls.project_name_to_urls[self._project_name]
          for archive_url in archive_urls:
              yield Link(archive_url)
  class _LocalFileSource(LinkSource):
      """``--find-links=<path-or-url>`` or ``--[extra-]index-url=<path-or-url>``.

      If a URL is supplied, it must be a ``file:`` URL. If a path is supplied
      to the option, it is converted to a URL first. This returns:

      * ``page_candidates``: Candidates listed on the file, if it is HTML.
      * ``file_links``: The file itself, if it is not HTML.
      """

      def __init__(
          self,
          candidates_from_page: CandidatesFromPage,
          link: Link,
      ) -> None:
          """Store the page-parsing callback and the link to the local file."""
          self._candidates_from_page = candidates_from_page
          self._link = link

      @property
      def link(self) -> Link | None:
          """The link this source was created with."""
          return self._link

      def page_candidates(self) -> FoundCandidates:
          """Yield the page's candidates; nothing for a non-HTML file."""
          if _is_html_file(self._link.url):
              yield from self._candidates_from_page(self._link)

      def file_links(self) -> FoundLinks:
          """Yield the link itself; nothing when the file is HTML."""
          if not _is_html_file(self._link.url):
              yield self._link
  class _RemoteFileSource(LinkSource):
      """``--find-links=<url>`` or ``--[extra-]index-url=<url>``.

      This returns:

      * ``page_candidates``: Candidates listed on the page, provided the
        page validator accepts the link.
      * ``file_links``: The link itself, unconditionally.
      """

      def __init__(
          self,
          candidates_from_page: CandidatesFromPage,
          page_validator: PageValidator,
          link: Link,
      ) -> None:
          """Store the two callbacks and the remote link."""
          self._candidates_from_page = candidates_from_page
          self._page_validator = page_validator
          self._link = link

      @property
      def link(self) -> Link | None:
          """The link this source was created with."""
          return self._link

      def page_candidates(self) -> FoundCandidates:
          """Yield the page's candidates; nothing if validation rejects it."""
          if self._page_validator(self._link):
              yield from self._candidates_from_page(self._link)

      def file_links(self) -> FoundLinks:
          """Yield the link itself."""
          yield self._link
  class _IndexDirectorySource(LinkSource):
      """``--[extra-]index-url=<path-to-directory>``.

      This is treated like a remote URL; ``candidates_from_page`` contains
      logic for this by appending ``index.html`` to the link.
      """

      def __init__(
          self,
          candidates_from_page: CandidatesFromPage,
          link: Link,
      ) -> None:
          """Store the page-parsing callback and the directory link."""
          self._candidates_from_page = candidates_from_page
          self._link = link

      @property
      def link(self) -> Link | None:
          """The link this source was created with."""
          return self._link

      def page_candidates(self) -> FoundCandidates:
          """Yield whatever ``candidates_from_page`` produces for the link."""
          found = self._candidates_from_page(self._link)
          yield from found

      def file_links(self) -> FoundLinks:
          """An index directory never contributes direct file links."""
          return ()
  def build_source(
      location: str,
      *,
      candidates_from_page: CandidatesFromPage,
      page_validator: PageValidator,
      expand_dir: bool,
      cache_link_parsing: bool,
      project_name: str,
  ) -> tuple[str | None, LinkSource | None]:
      """Pick the ``LinkSource`` implementation that fits *location*.

      The location is classified, in this order, as an existing local path,
      a ``file:`` URL, or any other URL accepted by ``is_url``.

      Returns a ``(url, source)`` pair:

      * ``(None, None)`` when the location matches none of the above
        (a warning is logged);
      * ``(url, _RemoteFileSource)`` for a URL without a local path;
      * ``(url, _FlatDirectorySource)`` for a local directory when
        ``expand_dir`` is true, ``(url, _IndexDirectorySource)`` otherwise;
      * ``(url, _LocalFileSource)`` for a local file;
      * ``(url, None)`` when the local path is neither a file nor a
        directory (a warning is logged).
      """
      url: str
      path: str | None

      # Step 1: resolve the location to a URL plus, if local, a path.
      if os.path.exists(location):  # Is a local path.
          url, path = path_to_url(location), location
      elif location.startswith("file:"):  # A file: URL.
          url, path = location, url_to_path(location)
      elif is_url(location):
          url, path = location, None
      else:
          logger.warning(
              "Location '%s' is ignored: "
              "it is either a non-existing path or lacks a specific scheme.",
              location,
          )
          return (None, None)

      # Step 2: no local path means the location is a remote URL.
      if path is None:
          remote_source: LinkSource = _RemoteFileSource(
              candidates_from_page=candidates_from_page,
              page_validator=page_validator,
              link=Link(url, cache_link_parsing=cache_link_parsing),
          )
          return (url, remote_source)

      # Step 3: local directory, either expanded flat or treated as an index.
      if os.path.isdir(path):
          if expand_dir:
              return (
                  url,
                  _FlatDirectorySource(
                      candidates_from_page=candidates_from_page,
                      path=path,
                      project_name=project_name,
                  ),
              )
          return (
              url,
              _IndexDirectorySource(
                  candidates_from_page=candidates_from_page,
                  link=Link(url, cache_link_parsing=cache_link_parsing),
              ),
          )

      # Step 4: a single local file.
      if os.path.isfile(path):
          return (
              url,
              _LocalFileSource(
                  candidates_from_page=candidates_from_page,
                  link=Link(url, cache_link_parsing=cache_link_parsing),
              ),
          )

      logger.warning(
          "Location '%s' is ignored: it is neither a file nor a directory.",
          location,
      )
      return (url, None)