python/private/pypi/simpleapi_download.bzl - third_party/github.com/bazelbuild/rules_python - Git at Google

 # Copyright 2024 The Bazel Authors. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 A file that houses private functions used in the `bzlmod` extension with the same name.
 """

 load("@bazel_features//:features.bzl", "bazel_features")
 load("//python/private:auth.bzl", _get_auth = "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
 load("//python/private:text_util.bzl", "render")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

 def simpleapi_download(
         ctx,
         *,
         attr,
         cache,
         parallel_download = True,
         read_simpleapi = None,
         get_auth = None,
         _fail = fail):
     """Download and parse the Simple API pages for a set of packages.

     The indexes are tried in order: `attr.index_url` first, followed by each of
     `attr.extra_index_urls`. A package is only requested from a later index if
     no earlier index produced a successful result for it, i.e. results from
     multiple indexes are never merged.

     Args:
         ctx: The module_ctx or repository_ctx.
         attr: Contains the parameters for the download. They are grouped into a
           struct for better clarity. It must have attributes:
            * index_url: str, the index.
            * index_url_overrides: dict[str, str], the index overrides for
              separate packages. The keys are normalized before the lookup and
              a matching entry replaces the index URL for that package.
            * extra_index_urls: Extra index URLs that will be looked up after
              the main is looked up.
            * sources: list[str], the package names to look up. Each value is
              appended to the index URL to form the URL of the package page.
            * envsubst: list[str], the envsubst vars for performing substitution in index url.
            * netrc: The netrc parameter for ctx.download, see http_file for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
                http_file for docs.
         cache: A dictionary that can be used as a cache between calls during a
             single evaluation of the extension. We use a dictionary as a cache
             so that we can reuse calls to the simple API when evaluating the
             extension. Using the canonical_id parameter of the module_ctx would
             deposit the simple API responses to the bazel cache and that is
             undesirable because additions to the PyPI index would not be
             reflected when re-evaluating the extension unless we do
             `bazel clean --expunge`.
         parallel_download: A boolean to enable usage of bazel 7.1 non-blocking
             downloads. It only has an effect if `bazel_features` reports that
             `download` has the `block` parameter.
         read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
             Used in tests.
         get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
         _fail: a function to print a failure. Used in tests.

     Returns:
         A dict from the normalized package name to the `output` of the
         successful `read_simpleapi` result for that package. If any source
         could not be found on any index, `_fail` is called and None is returned.
     """
     index_url_overrides = {
         normalize_name(p): i
         for p, i in (attr.index_url_overrides or {}).items()
     }

     download_kwargs = {}
     if bazel_features.external_deps.download_has_block_param:
         download_kwargs["block"] = not parallel_download

     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
     contents = {}
     index_urls = [attr.index_url] + attr.extra_index_urls
     read_simpleapi = read_simpleapi or _read_simpleapi

     found_on_index = {}
     for index_url in index_urls:
         sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
         if not sources:
             # Everything has been found already, the remaining indexes do
             # not need to be queried.
             break

         async_downloads = {}
         for pkg in sources:
             pkg_normalized = normalize_name(pkg)
             result = read_simpleapi(
                 ctx = ctx,
                 url = "{}/{}/".format(
                     index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
                     pkg,
                 ),
                 attr = attr,
                 cache = cache,
                 get_auth = get_auth,
                 **download_kwargs
             )
             if hasattr(result, "wait"):
                 # We will process it in a separate loop:
                 async_downloads[pkg] = struct(
                     pkg_normalized = pkg_normalized,
                     wait = result.wait,
                 )
             elif result.success:
                 contents[pkg_normalized] = result.output
                 found_on_index[pkg] = index_url

         # If we use `block` == False, then we need to have a second loop that is
         # collecting all of the results as they were being downloaded in parallel.
         for pkg, download in async_downloads.items():
             result = download.wait()

             if result.success:
                 contents[download.pkg_normalized] = result.output
                 found_on_index[pkg] = index_url

     failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
     if failed_sources:
         _fail("Failed to download metadata for {} for from urls: {}".format(
             failed_sources,
             index_urls,
         ))
         return None

     # Only packages that were found on an index other than the main one are
     # worth suggesting an override for. If there are none, stay silent instead
     # of printing a hint with an empty dict.
     suggested_overrides = {
         pkg: found_on_index[pkg]
         for pkg in attr.sources
         if found_on_index[pkg] != attr.index_url
     }
     if suggested_overrides:
         # buildifier: disable=print
         print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
             render.dict(suggested_overrides),
         ))

     return contents

 def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
     """Fetch a single SimpleAPI page and parse it, consulting the cache first.

     Args:
         ctx: The module_ctx or repository_ctx.
         url: str, the url parameter that can be passed to ctx.download.
         attr: The attribute that contains necessary info for downloading. The
           following attributes must be present:
            * envsubst: The envsubst values for performing substitutions in the URL.
            * netrc: The netrc parameter for ctx.download, see http_file for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
                http_file for docs.
         cache: A dict for storing the results, keyed by the substituted URL.
         get_auth: A function to get auth information. Used in tests.
         **download_kwargs: Any extra params to ctx.download.
             Note that output and auth will be passed for you.

     Returns:
         On a cache hit, a struct with `success = True` and the cached value in
         `output`. Otherwise, if `block` is False in the download_kwargs, a
         struct with a single `wait` function that returns the parsed result
         once called; in all other cases the parsed result itself, i.e. a
         struct with `success` and, when successful, the parsed simple api
         contents in `output`.
     """
     # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
     # the whl location and we cannot handle multiple URLs at once by passing
     # them to ctx.download if we want to correctly handle the relative URLs.
     # TODO: Add a test that env subbed index urls do not leak into the lock file.

     getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
     resolved_url = strip_empty_path_segments(envsubst(url, attr.envsubst, getenv))

     # The substituted URL doubles as the cache key.
     if resolved_url in cache:
         return struct(success = True, output = cache[resolved_url])

     # Use env names in the subst values - this will be unique over
     # the lifetime of the execution of this function and we also use
     # `~` as the separator to ensure that we don't get clashes.
     placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
     filename = envsubst(url, attr.envsubst, placeholders.get)

     # Transform the URL into a valid filename
     for separator in [".", ":", "/", "\\", "-"]:
         filename = filename.replace(separator, "_")
     filename = filename.strip("_").lower() + ".html"
     output = ctx.path(filename)

     auth_fn = get_auth or _get_auth

     # NOTE: this may have block = True or block = False in the download_kwargs
     download = ctx.download(
         url = [resolved_url],
         output = output,
         auth = auth_fn(ctx, [resolved_url], ctx_attr = attr),
         allow_fail = True,
         **download_kwargs
     )

     non_blocking = download_kwargs.get("block") == False
     if not non_blocking:
         return _read_index_result(ctx, download, output, resolved_url, cache, resolved_url)

     # Simulate the same API as ctx.download has
     return struct(
         wait = lambda: _read_index_result(ctx, download.wait(), output, resolved_url, cache, resolved_url),
     )

 def strip_empty_path_segments(url):
     """Removes empty path segments from a URL. Does nothing for urls with no scheme.

     Only the path is touched: the authority (everything between `://` and the
     next `/`) is kept as is, even when it is empty. This keeps URLs such as
     `file:///abs/path` intact instead of turning the first path segment into
     the host.

     Public only for testing.

     Args:
         url: The url to remove empty path segments from

     Returns:
         The url with empty path segments removed and any trailing slash preserved.
         If the url had no scheme it is returned unchanged.
     """
     scheme, _, rest = url.partition("://")
     if rest == "":
         return url

     # The first component after the scheme is the authority, not a path
     # segment, so it must survive even if it is empty.
     authority, _, path = rest.partition("/")
     segments = [p for p in path.split("/") if p]
     stripped = "/".join([authority] + segments)
     if url.endswith("/"):
         return "{}://{}/".format(scheme, stripped)
     else:
         return "{}://{}".format(scheme, stripped)

 def _read_index_result(ctx, result, output, url, cache, cache_key):
     """Turn a finished download into a parsed SimpleAPI result.

     Args:
         ctx: The module_ctx or repository_ctx, used to read the downloaded file.
         result: The download result; only its `success` attribute is inspected.
         output: The path that the index page was downloaded to.
         url: The URL the page was downloaded from, passed on to the parser.
         cache: A dict that receives the parsed contents under `cache_key`
             unless that key is already present.
         cache_key: The key to store the parsed contents under.

     Returns:
         A struct with `success = False` if the download failed or parsing
         yielded nothing, otherwise a struct with `success = True`, the parsed
         contents in `output` and the `cache_key`.
     """
     if result.success:
         parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
         if parsed:
             cache.setdefault(cache_key, parsed)
             return struct(success = True, output = parsed, cache_key = cache_key)

     return struct(success = False)
	# Copyright 2024 The Bazel Authors. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	A file that houses private functions used in the `bzlmod` extension with the same name.
	"""

	load("@bazel_features//:features.bzl", "bazel_features")
	load("//python/private:auth.bzl", _get_auth = "get_auth")
	load("//python/private:envsubst.bzl", "envsubst")
	load("//python/private:normalize_name.bzl", "normalize_name")
	load("//python/private:text_util.bzl", "render")
	load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

	def simpleapi_download(
	        ctx,
	        *,
	        attr,
	        cache,
	        parallel_download = True,
	        read_simpleapi = None,
	        get_auth = None,
	        _fail = fail):
	    """Download and parse the Simple API pages for a set of packages.

	    The indexes are tried in order: `attr.index_url` first, followed by each of
	    `attr.extra_index_urls`. A package is only requested from a later index if
	    no earlier index produced a successful result for it, i.e. results from
	    multiple indexes are never merged.

	    Args:
	        ctx: The module_ctx or repository_ctx.
	        attr: Contains the parameters for the download. They are grouped into a
	          struct for better clarity. It must have attributes:
	           * index_url: str, the index.
	           * index_url_overrides: dict[str, str], the index overrides for
	             separate packages. The keys are normalized before the lookup and
	             a matching entry replaces the index URL for that package.
	           * extra_index_urls: Extra index URLs that will be looked up after
	             the main is looked up.
	           * sources: list[str], the package names to look up. Each value is
	             appended to the index URL to form the URL of the package page.
	           * envsubst: list[str], the envsubst vars for performing substitution in index url.
	           * netrc: The netrc parameter for ctx.download, see http_file for docs.
	           * auth_patterns: The auth_patterns parameter for ctx.download, see
	               http_file for docs.
	        cache: A dictionary that can be used as a cache between calls during a
	            single evaluation of the extension. We use a dictionary as a cache
	            so that we can reuse calls to the simple API when evaluating the
	            extension. Using the canonical_id parameter of the module_ctx would
	            deposit the simple API responses to the bazel cache and that is
	            undesirable because additions to the PyPI index would not be
	            reflected when re-evaluating the extension unless we do
	            `bazel clean --expunge`.
	        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking
	            downloads. It only has an effect if `bazel_features` reports that
	            `download` has the `block` parameter.
	        read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
	            Used in tests.
	        get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
	        _fail: a function to print a failure. Used in tests.

	    Returns:
	        A dict from the normalized package name to the `output` of the
	        successful `read_simpleapi` result for that package. If any source
	        could not be found on any index, `_fail` is called and None is returned.
	    """
	    index_url_overrides = {
	        normalize_name(p): i
	        for p, i in (attr.index_url_overrides or {}).items()
	    }

	    download_kwargs = {}
	    if bazel_features.external_deps.download_has_block_param:
	        download_kwargs["block"] = not parallel_download

	    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
	    # to replicate how `pip` would handle this case.
	    contents = {}
	    index_urls = [attr.index_url] + attr.extra_index_urls
	    read_simpleapi = read_simpleapi or _read_simpleapi

	    found_on_index = {}
	    for index_url in index_urls:
	        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
	        if not sources:
	            # Everything has been found already, the remaining indexes do
	            # not need to be queried.
	            break

	        async_downloads = {}
	        for pkg in sources:
	            pkg_normalized = normalize_name(pkg)
	            result = read_simpleapi(
	                ctx = ctx,
	                url = "{}/{}/".format(
	                    index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
	                    pkg,
	                ),
	                attr = attr,
	                cache = cache,
	                get_auth = get_auth,
	                **download_kwargs
	            )
	            if hasattr(result, "wait"):
	                # We will process it in a separate loop:
	                async_downloads[pkg] = struct(
	                    pkg_normalized = pkg_normalized,
	                    wait = result.wait,
	                )
	            elif result.success:
	                contents[pkg_normalized] = result.output
	                found_on_index[pkg] = index_url

	        # If we use `block` == False, then we need to have a second loop that is
	        # collecting all of the results as they were being downloaded in parallel.
	        for pkg, download in async_downloads.items():
	            result = download.wait()

	            if result.success:
	                contents[download.pkg_normalized] = result.output
	                found_on_index[pkg] = index_url

	    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
	    if failed_sources:
	        _fail("Failed to download metadata for {} for from urls: {}".format(
	            failed_sources,
	            index_urls,
	        ))
	        return None

	    # Only packages that were found on an index other than the main one are
	    # worth suggesting an override for. If there are none, stay silent instead
	    # of printing a hint with an empty dict.
	    suggested_overrides = {
	        pkg: found_on_index[pkg]
	        for pkg in attr.sources
	        if found_on_index[pkg] != attr.index_url
	    }
	    if suggested_overrides:
	        # buildifier: disable=print
	        print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
	            render.dict(suggested_overrides),
	        ))

	    return contents

	def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
	    """Fetch a single SimpleAPI page and parse it, consulting the cache first.

	    Args:
	        ctx: The module_ctx or repository_ctx.
	        url: str, the url parameter that can be passed to ctx.download.
	        attr: The attribute that contains necessary info for downloading. The
	          following attributes must be present:
	           * envsubst: The envsubst values for performing substitutions in the URL.
	           * netrc: The netrc parameter for ctx.download, see http_file for docs.
	           * auth_patterns: The auth_patterns parameter for ctx.download, see
	               http_file for docs.
	        cache: A dict for storing the results, keyed by the substituted URL.
	        get_auth: A function to get auth information. Used in tests.
	        **download_kwargs: Any extra params to ctx.download.
	            Note that output and auth will be passed for you.

	    Returns:
	        On a cache hit, a struct with `success = True` and the cached value in
	        `output`. Otherwise, if `block` is False in the download_kwargs, a
	        struct with a single `wait` function that returns the parsed result
	        once called; in all other cases the parsed result itself, i.e. a
	        struct with `success` and, when successful, the parsed simple api
	        contents in `output`.
	    """
	    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
	    # the whl location and we cannot handle multiple URLs at once by passing
	    # them to ctx.download if we want to correctly handle the relative URLs.
	    # TODO: Add a test that env subbed index urls do not leak into the lock file.

	    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
	    resolved_url = strip_empty_path_segments(envsubst(url, attr.envsubst, getenv))

	    # The substituted URL doubles as the cache key.
	    if resolved_url in cache:
	        return struct(success = True, output = cache[resolved_url])

	    # Use env names in the subst values - this will be unique over
	    # the lifetime of the execution of this function and we also use
	    # `~` as the separator to ensure that we don't get clashes.
	    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
	    filename = envsubst(url, attr.envsubst, placeholders.get)

	    # Transform the URL into a valid filename
	    for separator in [".", ":", "/", "\\", "-"]:
	        filename = filename.replace(separator, "_")
	    filename = filename.strip("_").lower() + ".html"
	    output = ctx.path(filename)

	    auth_fn = get_auth or _get_auth

	    # NOTE: this may have block = True or block = False in the download_kwargs
	    download = ctx.download(
	        url = [resolved_url],
	        output = output,
	        auth = auth_fn(ctx, [resolved_url], ctx_attr = attr),
	        allow_fail = True,
	        **download_kwargs
	    )

	    non_blocking = download_kwargs.get("block") == False
	    if not non_blocking:
	        return _read_index_result(ctx, download, output, resolved_url, cache, resolved_url)

	    # Simulate the same API as ctx.download has
	    return struct(
	        wait = lambda: _read_index_result(ctx, download.wait(), output, resolved_url, cache, resolved_url),
	    )

	def strip_empty_path_segments(url):
	    """Removes empty path segments from a URL. Does nothing for urls with no scheme.

	    Only the path is touched: the authority (everything between `://` and the
	    next `/`) is kept as is, even when it is empty. This keeps URLs such as
	    `file:///abs/path` intact instead of turning the first path segment into
	    the host.

	    Public only for testing.

	    Args:
	        url: The url to remove empty path segments from

	    Returns:
	        The url with empty path segments removed and any trailing slash preserved.
	        If the url had no scheme it is returned unchanged.
	    """
	    scheme, _, rest = url.partition("://")
	    if rest == "":
	        return url

	    # The first component after the scheme is the authority, not a path
	    # segment, so it must survive even if it is empty.
	    authority, _, path = rest.partition("/")
	    segments = [p for p in path.split("/") if p]
	    stripped = "/".join([authority] + segments)
	    if url.endswith("/"):
	        return "{}://{}/".format(scheme, stripped)
	    else:
	        return "{}://{}".format(scheme, stripped)

	def _read_index_result(ctx, result, output, url, cache, cache_key):
	    """Turn a finished download into a parsed SimpleAPI result.

	    Args:
	        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
	        result: The download result; only its `success` attribute is inspected.
	        output: The path that the index page was downloaded to.
	        url: The URL the page was downloaded from, passed on to the parser.
	        cache: A dict that receives the parsed contents under `cache_key`
	            unless that key is already present.
	        cache_key: The key to store the parsed contents under.

	    Returns:
	        A struct with `success = False` if the download failed or parsing
	        yielded nothing, otherwise a struct with `success = True`, the parsed
	        contents in `output` and the `cache_key`.
	    """
	    if result.success:
	        parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
	        if parsed:
	            cache.setdefault(cache_key, parsed)
	            return struct(success = True, output = parsed, cache_key = cache_key)

	    return struct(success = False)