
Commit

feat: filter consumer data based on projects current requires_python
This is only done for the top 500 projects with manylinux wheels.
mayeut committed Nov 1, 2024
1 parent 0ca649f commit 1b348cb
Showing 8 changed files with 2,602 additions and 1,874 deletions.
3,428 changes: 1,560 additions & 1,868 deletions consumer_data/2024/10/31.csv

Large diffs are not rendered by default.

806 changes: 806 additions & 0 deletions filters.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions index.html
@@ -88,6 +88,10 @@ <h2 id="consumer-about">About consumer statistics</h2>
All manylinux wheel downloads from <a href="https://pypi.org/">PyPI</a> using pip are analysed each day to compute those statistics.
The data set is smoothed using a 1-month sliding window algorithm.
</p>
<p>
      Starting 2024-11-01, downloads made with a Python version that is not supported by the latest version of a package (based mostly on its requires_python metadata) are excluded from the data set.
      Overall download statistics per Python version are available on other websites, though those are generally not restricted to manylinux wheels or to Linux.
</p>
</div>
<div class="col-sm-6">
<h2 id="consumer-glibc-readiness-3.14">glibc readiness for python 3.14 (Preview)</h2>
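A minimal sketch of the idea announced in the paragraph above, assuming a package whose latest release declares requires_python ">=3.9" (the specifier and the version strings are illustrative; the real check runs as a BigQuery UDF in update_consumer_data.py below):

from packaging.specifiers import SpecifierSet

# Hypothetical requires_python of a package's latest release (illustrative value).
latest_requires_python = SpecifierSet(">=3.9")

def keep_download(consumer_python: str) -> bool:
    # A download stays in the data set only if the consumer's Python version
    # satisfies the requires_python of the latest release.
    return consumer_python in latest_requires_python

print(keep_download("3.8"))   # False -> excluded starting 2024-11-01
print(keep_download("3.12"))  # True  -> kept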
5 changes: 5 additions & 0 deletions packages.json
@@ -1508,6 +1508,7 @@
"cpp-demangle",
"cpp-meraki",
"cpp-package-for-python",
"cpp-tiff",
"cppbinder",
"cppbktree",
"cppbuiltins",
@@ -1955,6 +1956,7 @@
"defity",
"deflate",
"degirum",
"deglib",
"del-fem",
"del-msh",
"delphifmx",
@@ -7583,6 +7585,7 @@
"pykeyvi",
"pykingas",
"pykk",
"pyklang",
"pykmertools",
"pykonal",
"pykooh",
@@ -9653,6 +9656,7 @@
"sipm",
"siqadtools",
"siren",
"siri-parser",
"sisl",
"sizestr",
"skapex",
@@ -11105,6 +11109,7 @@
"vplanet",
"vpolo",
"vpsearch",
"vptq",
"vpx-rtp-py",
"vpython",
"vqf",
16 changes: 15 additions & 1 deletion update.py
@@ -10,6 +10,7 @@
import update_consumer_data
import update_consumer_stats
import update_dataset
import update_filters
import update_stats
import utils

@@ -25,7 +26,7 @@ def check_file(value):
return result


if __name__ == "__main__":
def main() -> None:
default_end = date.today() - timedelta(days=1)
default_start = default_end - timedelta(days=365 * 2)

@@ -38,6 +39,11 @@ def check_file(value):
action="store_true",
help="check all packages in PyPI.",
)
parser.add_argument(
"--update-filters",
action="store_true",
help="check minimum python version for each of the top 8000 PyPI packages.",
)
parser.add_argument(
"-s",
"--start",
@@ -95,6 +101,7 @@ def check_file(value):

if not args.skip_cache:
packages = update_cache.update(packages, args.all_pypi_packages)

packages, rows = update_dataset.update(packages)
with open(utils.ROOT_PATH / "packages.json", "w") as f:
json.dump(packages, f, indent=0)
@@ -104,3 +111,10 @@ def check_file(value):
copy(utils.ROOT_PATH / "style.css", utils.BUILD_PATH)
copy(utils.ROOT_PATH / "favicon.ico", utils.BUILD_PATH)
copy(utils.ROOT_PATH / ".gitignore", utils.BUILD_PATH)

if args.update_filters:
update_filters.update()


if __name__ == "__main__":
main()
41 changes: 38 additions & 3 deletions update_consumer_data.py
@@ -8,6 +8,8 @@
from google.api_core.exceptions import Forbidden, GoogleAPIError
from google.cloud import bigquery

import utils

_LOGGER = logging.getLogger(__name__)
BIGQUERY_TOKEN = "BIGQUERY_TOKEN"

@@ -23,7 +25,37 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
return

_LOGGER.info(f"bigquery: fetching downloads for {table_suffix}")
query = rf"""
filters = []
for filter_ in json.loads(utils.ROOT_PATH.joinpath("filters.json").read_text()):
name, python_version = filter_.split("-")
major, minor = python_version.split(".")
filter_condition = (
f' else if (fn.startsWith("{name}-")) '
f"{{ major = {major}; minor = {minor}; }}"
)
filters.append(filter_condition)
if len(filters) >= 500:
break
query = rf'''
CREATE TEMP FUNCTION check_min_python_version(filename STRING, python_version STRING)
RETURNS BOOL
LANGUAGE js
AS r"""
var major = 2;
var minor = 0;
const fn = filename.toLowerCase();
if (false) {{ }}
{"\n".join(filters)}
if ((major == 2) && (minor == 0)) return true;
const parts = /^(?<major>\d+)\.(?<minor>\d+).*/.exec(python_version);
if (!parts) return true;
python_major = parseInt(parts.groups["major"], 10);
python_minor = parseInt(parts.groups["minor"], 10);
return (python_major > major) ||
((python_major == major) && (python_minor >= minor));
""";
SELECT t0.cpu, t0.num_downloads, t0.python_version, t0.pip_version, t0.glibc_version
FROM (SELECT COUNT(*) AS num_downloads,
REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") as python_version,
@@ -34,10 +66,12 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
TIMESTAMP("{table_suffix} 23:59:59.999999 UTC") AND
details.installer.name = "pip" AND details.system.name = "Linux" AND
details.distro.libc.lib = "glibc" AND
REGEXP_CONTAINS(file.filename, r"-manylinux([0-9a-zA-Z_]+)\.whl")
REGEXP_CONTAINS(file.filename, r"-manylinux([0-9a-zA-Z_]+)\.whl") AND
check_min_python_version(file.filename, details.python)
GROUP BY pip_version, python_version, glibc_version, details.cpu
ORDER BY num_downloads DESC) AS t0;
"""
'''

with TemporaryDirectory() as temp:
if bigquery_credentials is None:
bigquery_credentials = Path(temp) / "key.json"
@@ -67,6 +101,7 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
return
if query_job.cache_hit:
_LOGGER.debug("bigquery: using cached results")
_LOGGER.info(f"bigquery: {query_job.total_bytes_billed // 1000000000} GB billed")
with file.open("w") as f:
f.write(",".join([f.name for f in rows.schema]) + "\n")
for row in rows:
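Each entry of filters.json is a "<name>-<major.minor>" string produced by update_filters.py, and the loop above turns the first 500 entries into JavaScript branches of the check_min_python_version temp function. A minimal sketch of that transformation, using "numpy-3.9" as an assumed entry (not necessarily present in the real filters.json):

filter_ = "numpy-3.9"  # hypothetical filters.json entry: "<name>-<min python>"
name, python_version = filter_.split("-")
major, minor = python_version.split(".")
condition = (
    f' else if (fn.startsWith("{name}-")) '
    f"{{ major = {major}; minor = {minor}; }}"
)
print(condition)
# ->  else if (fn.startsWith("numpy-")) { major = 3; minor = 9; }

Downloads whose filename matches none of the generated branches keep the defaults major = 2, minor = 0, in which case the UDF returns true and the row is never filtered out.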
4 changes: 2 additions & 2 deletions update_dataset.py
@@ -34,7 +34,7 @@ def _filter_versions(package: str, info: dict) -> list[str]:
return filtered


def _parse_version(files: list[dict[str, str]]) -> tuple[date, str, str]:
def parse_version(files: list[dict[str, str]]) -> tuple[date, str, str]:
upload_date = date.max.isoformat()
pythons = set()
manylinux = set()
@@ -111,7 +111,7 @@ def _package_update(package: str) -> list[utils.Row]:
_LOGGER.debug(f'"{package}": using "{versions}"')
rows = []
for version in versions:
week, python, manylinux = _parse_version(info["releases"][version])
week, python, manylinux = parse_version(info["releases"][version])
if python == "" or manylinux == "":
continue
rows.append(utils.Row(week, package, version, python, manylinux))
172 changes: 172 additions & 0 deletions update_filters.py
@@ -0,0 +1,172 @@
import gzip
import json
import logging
import sqlite3
from contextlib import contextmanager
from pathlib import Path

import requests
from packaging.specifiers import InvalidSpecifier, SpecifierSet
from packaging.utils import canonicalize_name
from packaging.version import InvalidVersion, Version

import update_dataset
import utils

_LOGGER = logging.getLogger(__name__)


@contextmanager
def sqlite3_connect(path: Path):
try:
con = sqlite3.connect(path)
yield con
finally:
con.close()


def _filter_versions(package: str, info: dict) -> str | None:
candidate_versions = []
for version in info["releases"].keys():
try:
version_pep = Version(version)
if version_pep.is_prerelease:
_LOGGER.debug(f'"{package}": ignore pre-release {version}')
continue
candidate_versions.append((version, version_pep))
except InvalidVersion as e:
_LOGGER.warning(f'"{package}": {e}')

if not candidate_versions:
return None

candidate_versions.sort(key=lambda x: x[1], reverse=True)

return candidate_versions[0][0]


def _get_filter(
files: list[dict[str, str]], last_python_requires: str | None
) -> str | None:
names: set[str] = set()
requires_pythons: set[SpecifierSet] = set()
for file in files:
filename = file["filename"]
if not filename.lower().endswith(".whl"):
continue
parsed_filename = utils.WHEEL_INFO_RE.match(filename)
if parsed_filename is None:
continue
metadata = utils.WheelMetadata(*parsed_filename.groups()[1:])
names.add(metadata.name.lower())
if file["requires_python"]:
fixup_requires_python = file["requires_python"]
fixup_requires_python = fixup_requires_python.replace(".*", "")
fixup_requires_python = fixup_requires_python.replace("*", "")
fixup_requires_python = fixup_requires_python.replace('"', "")
fixup_requires_python = fixup_requires_python.replace("0<", "0,<")
fixup_requires_python = fixup_requires_python.replace("3<", "3,<")
try:
requires_python = SpecifierSet(fixup_requires_python)
requires_pythons.add(requires_python)
except InvalidSpecifier:
specifier_set = file["requires_python"]
_LOGGER.warning(
f'invalid requires_python "{specifier_set}" for wheel "{filename}"'
)

if not names:
return None

assert len(names) == 1
name = names.pop()
python = "2.0"

def _get_min_python(spec_sets: set[SpecifierSet]):
for minor in range(6, 8):
if any(f"2.{minor}" in spec_set for spec_set in spec_sets):
return f"2.{minor}"
for minor in range(0, 99):
if any(f"3.{minor}" in spec_set for spec_set in spec_sets):
return f"3.{minor}"
return python

if requires_pythons:
python = _get_min_python(requires_pythons)
else:
# reuse update_dataset parsing
_, pythons_str, _ = update_dataset.parse_version(files)
pythons = pythons_str.split(".")
if pythons[0] == "abi3":
del pythons[0]
if pythons[0] == "py2":
python = "2.0"
elif pythons[0] == "py32":
python = "3.0"
else:
python = f"{pythons[0][2]}.{pythons[0][3:]}"
python = python

if last_python_requires:
last_set = SpecifierSet(last_python_requires)
if python not in last_set:
python = _get_min_python({last_set})

result = f"{name}-{python}"
overrides = {
"cython-2.7": "cython-3.6", # no wheels below 3.6
"opencv_python-3.6": "opencv_python-3.7", # no wheels below 3.7
"visualdl-2.7": "visualdl-3.0", # pure wheel, no requires_python
"parallel_ssh-2.7": "parallel_ssh-3.0", # pure wheel, no requires_python
}
return overrides.get(result, result)


def update() -> None:
pypi_data_version = "2024.10.08"
pypi_data_cache = utils.CACHE_PATH / f"pypi-{pypi_data_version}.db"
if not pypi_data_cache.exists():
_LOGGER.info("pypi data: download")
db_url = (
"https://github.com/sethmlarson/pypi-data/releases/download/"
f"{pypi_data_version}/pypi.db.gz"
)
response = requests.get(db_url)
response.raise_for_status()
_LOGGER.info("pypi data: decompressing")
pypi_data_cache.write_bytes(gzip.decompress(response.content))
response = requests.get(
"https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
)
response.raise_for_status()
top_packages_data = response.json()
rows = sorted(
top_packages_data["rows"], key=lambda x: x["download_count"], reverse=True
)
top_packages = [row["project"] for row in rows]
filters = []
with sqlite3_connect(pypi_data_cache) as con:
for package in top_packages:
package_norm = canonicalize_name(package)
cache_file = utils.get_release_cache_path(package_norm)
if not cache_file.exists():
continue
with open(cache_file) as f:
info = json.load(f)
version = _filter_versions(package, info)
if version is None:
continue
query = "SELECT requires_python FROM packages WHERE name = ?"
cur = con.execute(query, (package,))
res = cur.fetchone()
cur.close()
assert res is not None
python_requires = res[0]

filter_ = _get_filter(info["releases"][version], python_requires)
if filter_ is None:
continue
filters.append(filter_)
with open(utils.ROOT_PATH / "filters.json", "w") as f:
json.dump(filters, f, indent=0)
f.write("\n")
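_get_filter reduces the (fixed-up) requires_python specifiers of a release's wheels to a single minimum Python version by scanning candidate versions from oldest to newest. A minimal sketch of that resolution, assuming an illustrative requires_python value of ">=3.8.*":

from packaging.specifiers import SpecifierSet

raw = ">=3.8.*"  # hypothetical wheel requires_python value
fixed = raw.replace(".*", "").replace("*", "").replace('"', "")  # same fixups as _get_filter
spec_set = SpecifierSet(fixed)  # ">=3.8"

def min_python(spec_sets: set[SpecifierSet]) -> str:
    # Return the oldest candidate version accepted by any specifier set,
    # mirroring _get_min_python above (2.6/2.7 first, then 3.0 .. 3.98).
    for minor in range(6, 8):
        if any(f"2.{minor}" in s for s in spec_sets):
            return f"2.{minor}"
    for minor in range(0, 99):
        if any(f"3.{minor}" in s for s in spec_sets):
            return f"3.{minor}"
    return "2.0"  # fallback when nothing matches

print(min_python({spec_set}))  # 3.8 -> stored in filters.json as "<name>-3.8"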
