
Commit

feat: filter consumer data based on projects current requires_python
This is only done for the top 500 projects with manylinux wheels.
mayeut committed Nov 1, 2024
1 parent 0ca649f commit 1b348cb
Showing 8 changed files with 2,602 additions and 1,874 deletions.
3,428 changes: 1,560 additions & 1,868 deletions consumer_data/2024/10/31.csv

Large diffs are not rendered by default.

806 changes: 806 additions & 0 deletions filters.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions index.html
@@ -88,6 +88,10 @@ <h2 id="consumer-about">About consumer statistics</h2>
All manylinux wheel downloads from <a href="https://pypi.org/">PyPI</a> using pip are analysed each day to compute those statistics.
The data set is smoothed using a 1-month sliding window algorithm.
</p>
<p>
      Starting 2024-11-01, downloads made with a Python version that is not supported by the latest version of a package (based mostly on its requires_python metadata) are excluded from the data set.
      Overall download statistics per Python version are available on other websites, though those are generally not restricted to manylinux wheels or to Linux.
</p>
</div>
<div class="col-sm-6">
<h2 id="consumer-glibc-readiness-3.14">glibc readiness for python 3.14 (Preview)</h2>
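A minimal sketch of the idea announced in the paragraph above, assuming a package whose latest release declares requires_python ">=3.9" (the specifier and the version strings are illustrative; the real check runs as a BigQuery UDF in update_consumer_data.py below):

from packaging.specifiers import SpecifierSet

# Hypothetical requires_python of a package's latest release (illustrative value).
latest_requires_python = SpecifierSet(">=3.9")

def keep_download(consumer_python: str) -> bool:
    # A download stays in the data set only if the consumer's Python version
    # satisfies the requires_python of the latest release.
    return consumer_python in latest_requires_python

print(keep_download("3.8"))   # False -> excluded starting 2024-11-01
print(keep_download("3.12"))  # True  -> kept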
5 changes: 5 additions & 0 deletions packages.json
@@ -1508,6 +1508,7 @@
"cpp-demangle",
"cpp-meraki",
"cpp-package-for-python",
"cpp-tiff",
"cppbinder",
"cppbktree",
"cppbuiltins",
@@ -1955,6 +1956,7 @@
"defity",
"deflate",
"degirum",
"deglib",
"del-fem",
"del-msh",
"delphifmx",
@@ -7583,6 +7585,7 @@
"pykeyvi",
"pykingas",
"pykk",
"pyklang",
"pykmertools",
"pykonal",
"pykooh",
@@ -9653,6 +9656,7 @@
"sipm",
"siqadtools",
"siren",
"siri-parser",
"sisl",
"sizestr",
"skapex",
@@ -11105,6 +11109,7 @@
"vplanet",
"vpolo",
"vpsearch",
"vptq",
"vpx-rtp-py",
"vpython",
"vqf",
16 changes: 15 additions & 1 deletion update.py
@@ -10,6 +10,7 @@
import update_consumer_data
import update_consumer_stats
import update_dataset
import update_filters
import update_stats
import utils

@@ -25,7 +26,7 @@ def check_file(value):
return result


if __name__ == "__main__":
def main() -> None:
default_end = date.today() - timedelta(days=1)
default_start = default_end - timedelta(days=365 * 2)

@@ -38,6 +39,11 @@ def check_file(value):
action="store_true",
help="check all packages in PyPI.",
)
parser.add_argument(
"--update-filters",
action="store_true",
help="check minimum python version for each of the top 8000 PyPI packages.",
)
parser.add_argument(
"-s",
"--start",
@@ -95,6 +101,7 @@ def check_file(value):

if not args.skip_cache:
packages = update_cache.update(packages, args.all_pypi_packages)

packages, rows = update_dataset.update(packages)
with open(utils.ROOT_PATH / "packages.json", "w") as f:
json.dump(packages, f, indent=0)
@@ -104,3 +111,10 @@ def check_file(value):
copy(utils.ROOT_PATH / "style.css", utils.BUILD_PATH)
copy(utils.ROOT_PATH / "favicon.ico", utils.BUILD_PATH)
copy(utils.ROOT_PATH / ".gitignore", utils.BUILD_PATH)

if args.update_filters:
update_filters.update()


if __name__ == "__main__":
main()
41 changes: 38 additions & 3 deletions update_consumer_data.py
@@ -8,6 +8,8 @@
from google.api_core.exceptions import Forbidden, GoogleAPIError
from google.cloud import bigquery

import utils

_LOGGER = logging.getLogger(__name__)
BIGQUERY_TOKEN = "BIGQUERY_TOKEN"

@@ -23,7 +25,37 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
return

_LOGGER.info(f"bigquery: fetching downloads for {table_suffix}")
query = rf"""
filters = []
for filter_ in json.loads(utils.ROOT_PATH.joinpath("filters.json").read_text()):
name, python_version = filter_.split("-")
major, minor = python_version.split(".")
filter_condition = (
f' else if (fn.startsWith("{name}-")) '
f"{{ major = {major}; minor = {minor}; }}"
)
filters.append(filter_condition)
if len(filters) >= 500:
break
query = rf'''
CREATE TEMP FUNCTION check_min_python_version(filename STRING, python_version STRING)
RETURNS BOOL
LANGUAGE js
AS r"""
var major = 2;
var minor = 0;
const fn = filename.toLowerCase();
if (false) {{ }}
{"\n".join(filters)}
if ((major == 2) && (minor == 0)) return true;
const parts = /^(?<major>\d+)\.(?<minor>\d+).*/.exec(python_version);
if (!parts) return true;
python_major = parseInt(parts.groups["major"], 10);
python_minor = parseInt(parts.groups["minor"], 10);
return (python_major > major) ||
((python_major == major) && (python_minor >= minor));
""";
SELECT t0.cpu, t0.num_downloads, t0.python_version, t0.pip_version, t0.glibc_version
FROM (SELECT COUNT(*) AS num_downloads,
REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") as python_version,
@@ -34,10 +66,12 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
TIMESTAMP("{table_suffix} 23:59:59.999999 UTC") AND
details.installer.name = "pip" AND details.system.name = "Linux" AND
details.distro.libc.lib = "glibc" AND
REGEXP_CONTAINS(file.filename, r"-manylinux([0-9a-zA-Z_]+)\.whl")
REGEXP_CONTAINS(file.filename, r"-manylinux([0-9a-zA-Z_]+)\.whl") AND
check_min_python_version(file.filename, details.python)
GROUP BY pip_version, python_version, glibc_version, details.cpu
ORDER BY num_downloads DESC) AS t0;
"""
'''

with TemporaryDirectory() as temp:
if bigquery_credentials is None:
bigquery_credentials = Path(temp) / "key.json"
@@ -67,6 +101,7 @@ def _update_consumer_data(path: Path, bigquery_credentials: Path | None) -> None
return
if query_job.cache_hit:
_LOGGER.debug("bigquery: using cached results")
_LOGGER.info(f"bigquery: {query_job.total_bytes_billed // 1000000000} GB billed")
with file.open("w") as f:
f.write(",".join([f.name for f in rows.schema]) + "\n")
for row in rows:
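Each entry of filters.json is a "<name>-<major.minor>" string produced by update_filters.py, and the loop above turns the first 500 entries into JavaScript branches of the check_min_python_version temp function. A minimal sketch of that transformation, using "numpy-3.9" as an assumed entry (not necessarily present in the real filters.json):

filter_ = "numpy-3.9"  # hypothetical filters.json entry: "<name>-<min python>"
name, python_version = filter_.split("-")
major, minor = python_version.split(".")
condition = (
    f' else if (fn.startsWith("{name}-")) '
    f"{{ major = {major}; minor = {minor}; }}"
)
print(condition)
# ->  else if (fn.startsWith("numpy-")) { major = 3; minor = 9; }

Downloads whose filename matches none of the generated branches keep the defaults major = 2, minor = 0, in which case the UDF returns true and the row is never filtered out.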
4 changes: 2 additions & 2 deletions update_dataset.py
@@ -34,7 +34,7 @@ def _filter_versions(package: str, info: dict) -> list[str]:
return filtered


def _parse_version(files: list[dict[str, str]]) -> tuple[date, str, str]:
def parse_version(files: list[dict[str, str]]) -> tuple[date, str, str]:
upload_date = date.max.isoformat()
pythons = set()
manylinux = set()
@@ -111,7 +111,7 @@ def _package_update(package: str) -> list[utils.Row]:
_LOGGER.debug(f'"{package}": using "{versions}"')
rows = []
for version in versions:
week, python, manylinux = _parse_version(info["releases"][version])
week, python, manylinux = parse_version(info["releases"][version])
if python == "" or manylinux == "":
continue
rows.append(utils.Row(week, package, version, python, manylinux))
172 changes: 172 additions & 0 deletions update_filters.py
@@ -0,0 +1,172 @@
import gzip
import json
import logging
import sqlite3
from contextlib import contextmanager
from pathlib import Path

import requests
from packaging.specifiers import InvalidSpecifier, SpecifierSet
from packaging.utils import canonicalize_name
from packaging.version import InvalidVersion, Version

import update_dataset
import utils

_LOGGER = logging.getLogger(__name__)


@contextmanager
def sqlite3_connect(path: Path):
try:
con = sqlite3.connect(path)
yield con
finally:
con.close()


def _filter_versions(package: str, info: dict) -> str | None:
candidate_versions = []
for version in info["releases"].keys():
try:
version_pep = Version(version)
if version_pep.is_prerelease:
_LOGGER.debug(f'"{package}": ignore pre-release {version}')
continue
candidate_versions.append((version, version_pep))
except InvalidVersion as e:
_LOGGER.warning(f'"{package}": {e}')

if not candidate_versions:
return None

candidate_versions.sort(key=lambda x: x[1], reverse=True)

return candidate_versions[0][0]


def _get_filter(
files: list[dict[str, str]], last_python_requires: str | None
) -> str | None:
names: set[str] = set()
requires_pythons: set[SpecifierSet] = set()
for file in files:
filename = file["filename"]
if not filename.lower().endswith(".whl"):
continue
parsed_filename = utils.WHEEL_INFO_RE.match(filename)
if parsed_filename is None:
continue
metadata = utils.WheelMetadata(*parsed_filename.groups()[1:])
names.add(metadata.name.lower())
if file["requires_python"]:
fixup_requires_python = file["requires_python"]
fixup_requires_python = fixup_requires_python.replace(".*", "")
fixup_requires_python = fixup_requires_python.replace("*", "")
fixup_requires_python = fixup_requires_python.replace('"', "")
fixup_requires_python = fixup_requires_python.replace("0<", "0,<")
fixup_requires_python = fixup_requires_python.replace("3<", "3,<")
try:
requires_python = SpecifierSet(fixup_requires_python)
requires_pythons.add(requires_python)
except InvalidSpecifier:
specifier_set = file["requires_python"]
_LOGGER.warning(
f'invalid requires_python "{specifier_set}" for wheel "{filename}"'
)

if not names:
return None

assert len(names) == 1
name = names.pop()
python = "2.0"

def _get_min_python(spec_sets: set[SpecifierSet]):
for minor in range(6, 8):
if any(f"2.{minor}" in spec_set for spec_set in spec_sets):
return f"2.{minor}"
for minor in range(0, 99):
if any(f"3.{minor}" in spec_set for spec_set in spec_sets):
return f"3.{minor}"
return python

if requires_pythons:
python = _get_min_python(requires_pythons)
else:
# reuse update_dataset parsing
_, pythons_str, _ = update_dataset.parse_version(files)
pythons = pythons_str.split(".")
if pythons[0] == "abi3":
del pythons[0]
if pythons[0] == "py2":
python = "2.0"
elif pythons[0] == "py32":
python = "3.0"
else:
python = f"{pythons[0][2]}.{pythons[0][3:]}"
python = python

if last_python_requires:
last_set = SpecifierSet(last_python_requires)
if python not in last_set:
python = _get_min_python({last_set})

result = f"{name}-{python}"
overrides = {
"cython-2.7": "cython-3.6", # no wheels below 3.6
"opencv_python-3.6": "opencv_python-3.7", # no wheels below 3.7
"visualdl-2.7": "visualdl-3.0", # pure wheel, no requires_python
"parallel_ssh-2.7": "parallel_ssh-3.0", # pure wheel, no requires_python
}
return overrides.get(result, result)


def update() -> None:
pypi_data_version = "2024.10.08"
pypi_data_cache = utils.CACHE_PATH / f"pypi-{pypi_data_version}.db"
if not pypi_data_cache.exists():
_LOGGER.info("pypi data: download")
db_url = (
"https://github.com/sethmlarson/pypi-data/releases/download/"
f"{pypi_data_version}/pypi.db.gz"
)
response = requests.get(db_url)
response.raise_for_status()
_LOGGER.info("pypi data: decompressing")
pypi_data_cache.write_bytes(gzip.decompress(response.content))
response = requests.get(
"https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
)
response.raise_for_status()
top_packages_data = response.json()
rows = sorted(
top_packages_data["rows"], key=lambda x: x["download_count"], reverse=True
)
top_packages = [row["project"] for row in rows]
filters = []
with sqlite3_connect(pypi_data_cache) as con:
for package in top_packages:
package_norm = canonicalize_name(package)
cache_file = utils.get_release_cache_path(package_norm)
if not cache_file.exists():
continue
with open(cache_file) as f:
info = json.load(f)
version = _filter_versions(package, info)
if version is None:
continue
query = "SELECT requires_python FROM packages WHERE name = ?"
cur = con.execute(query, (package,))
res = cur.fetchone()
cur.close()
assert res is not None
python_requires = res[0]

filter_ = _get_filter(info["releases"][version], python_requires)
if filter_ is None:
continue
filters.append(filter_)
with open(utils.ROOT_PATH / "filters.json", "w") as f:
json.dump(filters, f, indent=0)
f.write("\n")
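_get_filter reduces the (fixed-up) requires_python specifiers of a release's wheels to a single minimum Python version by scanning candidate versions from oldest to newest. A minimal sketch of that resolution, assuming an illustrative requires_python value of ">=3.8.*":

from packaging.specifiers import SpecifierSet

raw = ">=3.8.*"  # hypothetical wheel requires_python value
fixed = raw.replace(".*", "").replace("*", "").replace('"', "")  # same fixups as _get_filter
spec_set = SpecifierSet(fixed)  # ">=3.8"

def min_python(spec_sets: set[SpecifierSet]) -> str:
    # Return the oldest candidate version accepted by any specifier set,
    # mirroring _get_min_python above (2.6/2.7 first, then 3.0 .. 3.98).
    for minor in range(6, 8):
        if any(f"2.{minor}" in s for s in spec_sets):
            return f"2.{minor}"
    for minor in range(0, 99):
        if any(f"3.{minor}" in s for s in spec_sets):
            return f"3.{minor}"
    return "2.0"  # fallback when nothing matches

print(min_python({spec_set}))  # 3.8 -> stored in filters.json as "<name>-3.8"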
