Skip to content

Commit

Permalink
Improve cleaner performance (#79)
Browse files Browse the repository at this point in the history
* Improve cleaner performance

* bug fix

* cleanup

* cleanup

* recursive

* cleanup

* improve cleaner

* add error

* no withdirs

* fix 3.8 typing
  • Loading branch information
malmans2 authored Apr 7, 2023
1 parent 7fc048d commit b5981e4
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .cruft.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"template": "https://github.com/ecmwf-projects/cookiecutter-conda-package",
"commit": "9fc05230b33d1c3711da8f38a5368ea65c95c2b1",
"commit": "23164e60b3e44472716d3d46bcd3ef47bbf2e79a",
"checkout": null,
"context": {
"cookiecutter": {
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
- id: debug-statements
- id: mixed-line-ending
- repo: https://github.com/psf/black
rev: 23.1.0
rev: 23.3.0
hooks:
- id: black
- repo: https://github.com/keewis/blackdoc
Expand All @@ -20,7 +20,7 @@ repos:
- id: blackdoc
additional_dependencies: [black==22.3.0]
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.259
rev: v0.0.260
hooks:
- id: ruff
args: [--fix]
Expand Down
58 changes: 43 additions & 15 deletions cacholote/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import datetime
import functools
import json
Expand Down Expand Up @@ -60,7 +61,10 @@ def _delete_cache_file(
with utils._Locker(fs, urlpath) as file_exists:
if file_exists:
logger.info("Delete cache file", urlpath=urlpath)
fs.rm(urlpath, recursive=True)
fs.rm(
urlpath,
recursive=obj["args"][0]["type"] == "application/vnd+zarr",
)

return obj

Expand All @@ -78,6 +82,17 @@ def _delete_cache_entry(
def delete(
func_to_del: Union[str, Callable[..., Any]], *args: Any, **kwargs: Any
) -> None:
"""Delete a function call previously cached.

Parameters
----------
func_to_del: callable, str
    Function to delete from cache
*args: Any
    Arguments of the function call to delete from cache
**kwargs: Any
    Keyword arguments of the function call to delete from cache
"""
hexdigest = encode._hexdigestify_python_call(func_to_del, *args, **kwargs)
with config.get().sessionmaker() as session:
for cache_entry in session.query(database.CacheEntry).filter(
Expand All @@ -88,25 +103,34 @@ def delete(

class _Cleaner:
def __init__(self, logger: structlog.stdlib.BoundLogger) -> None:
    """Scan the cache files directory and record the disk usage of each top-level entry.

    Parameters
    ----------
    logger: structlog.stdlib.BoundLogger
        Logger used to report progress while cleaning.
    """
    fs, dirname = utils.get_cache_files_fs_dirname()
    urldir = fs.unstrip_protocol(dirname)

    logger.info("Get disk usage of cache files")
    # One fs.du(..., total=False) call returns per-file sizes; group files
    # under their first path component so a directory entry is accounted
    # for as a single unit.
    sizes: Dict[str, int] = collections.defaultdict(lambda: 0)
    for path, size in fs.du(dirname, total=False).items():
        urlpath = fs.unstrip_protocol(path)
        basename, *_ = urlpath.replace(urldir, "", 1).strip("/").split("/")
        if basename:
            sizes[posixpath.join(urldir, basename)] += size

    self.logger = logger
    self.fs = fs
    self.dirname = dirname
    self.sizes = sizes

@property
def size(self) -> int:
    """Total size of all tracked cache entries (bytes as reported by ``fs.du``)."""
    sum_sizes = sum(self.sizes.values())
    self.logger.info("Check cache files total size", size=sum_sizes)
    return sum_sizes

def stop_cleaning(self, maxsize: int) -> bool:
    """Return True when the cache already fits within ``maxsize`` bytes.

    Parameters
    ----------
    maxsize: int
        Maximum total size of cache files allowed, in bytes.
    """
    # ``self.size`` logs the current total as a side effect.
    return self.size <= maxsize

def get_unknown_files(self, lock_validity_period: Optional[float]) -> Set[str]:
self.logger.info("Get unknown files")
now = datetime.datetime.now()
files_to_skip = []
for urlpath in self.sizes:
Expand All @@ -133,16 +157,17 @@ def get_unknown_files(self, lock_validity_period: Optional[float]) -> Set[str]:
)
return set(unknown_sizes)

def delete_unknown_files(
    self, lock_validity_period: Optional[float], recursive: bool = False
) -> None:
    """Delete files that are not registered in the cache database.

    Parameters
    ----------
    lock_validity_period: float, optional
        Validity period of lock files in seconds; expired locks are ignored.
    recursive: bool, default: False
        Whether to delete unknown directories as well. Defaults to False so
        the previous single-argument call signature keeps its old behavior.
    """
    for urlpath in self.get_unknown_files(lock_validity_period):
        # Drop the entry from the size bookkeeping regardless of whether the
        # deletion below succeeds under the lock.
        self.sizes.pop(urlpath)
        with utils._Locker(self.fs, urlpath, lock_validity_period) as file_exists:
            if file_exists:
                self.logger.info(
                    "Delete unknown", urlpath=urlpath, recursive=recursive
                )
                self.fs.rm(urlpath, recursive=recursive)

@staticmethod
def check_tags(*args: Any) -> None:
Expand Down Expand Up @@ -225,6 +250,7 @@ def clean_cache_files(
maxsize: int,
method: Literal["LRU", "LFU"] = "LRU",
delete_unknown_files: bool = False,
recursive: bool = False,
lock_validity_period: Optional[float] = None,
tags_to_clean: Optional[Sequence[Optional[str]]] = None,
tags_to_keep: Optional[Sequence[Optional[str]]] = None,
Expand All @@ -241,6 +267,8 @@ def clean_cache_files(
* LFU: Least Frequently Used
delete_unknown_files: bool, default: False
Delete all files that are not registered in the cache database.
recursive: bool, default: False
Whether to delete unknown directories or not
lock_validity_period: float, optional, default: None
Validity period of lock files in seconds. Expired locks will be deleted.
tags_to_clean, tags_to_keep: sequence of strings/None, optional, default: None
Expand All @@ -253,7 +281,7 @@ def clean_cache_files(
cleaner = _Cleaner(logger=logger or LOGGER)

if delete_unknown_files:
cleaner.delete_unknown_files(lock_validity_period)
cleaner.delete_unknown_files(lock_validity_period, recursive)

cleaner.delete_cache_files(
maxsize=maxsize,
Expand Down
31 changes: 25 additions & 6 deletions tests/test_60_clean.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import contextlib
import pathlib
from typing import Any, Literal, Optional, Sequence

Expand All @@ -6,6 +7,8 @@

from cacholote import cache, clean, config, utils

does_not_raise = contextlib.nullcontext


@cache.cacheable
def open_url(url: pathlib.Path) -> fsspec.spec.AbstractBufferedFile:
Expand Down Expand Up @@ -76,6 +79,24 @@ def test_delete_unknown_files(tmpdir: pathlib.Path, delete_unknown_files: bool)
assert fs.ls(dirname) == [f"{dirname}/unknown.txt"]


@pytest.mark.parametrize(
    "recursive,raises,final_size",
    [
        (True, does_not_raise(), 0),
        (False, pytest.raises((PermissionError, IsADirectoryError)), 1),
    ],
)
def test_delete_unknown_dirs(
    recursive: bool, raises: contextlib.nullcontext, final_size: int  # type: ignore[type-arg]
) -> None:
    """An unknown directory is removed only when ``recursive`` is enabled."""
    fs, dirname = utils.get_cache_files_fs_dirname()
    unknown_dir = f"{dirname}/unknown"
    fs.mkdir(unknown_dir)
    fs.touch(f"{unknown_dir}/unknown.txt")
    with raises:
        clean.clean_cache_files(0, delete_unknown_files=True, recursive=recursive)
    assert len(fs.ls(dirname)) == final_size


@pytest.mark.parametrize("lock_validity_period", [None, 0])
def test_clean_locked_files(
tmpdir: pathlib.Path, lock_validity_period: Optional[float]
Expand Down Expand Up @@ -147,15 +168,13 @@ def test_clean_tagged_files_wrong_args() -> None:

@pytest.mark.parametrize("wrong_type", ["1", [1]])
def test_clean_tagged_files_wrong_types(wrong_type: Any) -> None:
    """Both tag keyword arguments reject values that are not sequences of str/None."""
    # Build the pytest.raises context once and reuse it for both calls.
    raises = pytest.raises(
        TypeError,
        match="tags_to_clean/keep must be None or a sequence of str/None.",
    )
    with raises:
        clean.clean_cache_files(1, tags_to_keep=wrong_type)
    with raises:
        clean.clean_cache_files(1, tags_to_clean=wrong_type)


Expand Down

0 comments on commit b5981e4

Please sign in to comment.