Skip to content

Commit

Permalink
Bug fix: PII email matching and quality filter default arguments
Browse files: browse the repository at this point in the history
  • Loading branch information
guipenedo committed May 2, 2024
1 parent a8d21e2 commit c72b1e4
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 5 deletions.
3 changes: 2 additions & 1 deletion src/datatrove/pipeline/filters/fineweb_quality_filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datatrove.pipeline.filters.base_filter import BaseFilter
from datatrove.pipeline.filters.gopher_repetition_filter import find_duplicates
from datatrove.pipeline.writers.disk_base import DiskWriter


class FineWebQualityFilter(BaseFilter):
Expand All @@ -8,7 +9,7 @@ class FineWebQualityFilter(BaseFilter):

def __init__(
self,
exclusion_writer,
exclusion_writer: DiskWriter = None,
line_punct_thr: float = 0.12,
line_punct_exclude_zero: bool = False,
short_line_thr: float = 0.67,
Expand Down
8 changes: 4 additions & 4 deletions src/datatrove/pipeline/formatters/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import partial
from typing import Callable

from .base import BaseFormatter
from datatrove.pipeline.formatters.base import BaseFormatter


class PIIReplacer:
Expand Down Expand Up @@ -74,9 +74,9 @@ def __init__(
self.remove_ips = remove_ips

self.emails_replacer = PIIReplacer(
r"\b[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[a-z0-9](?:[a-z0-9-]*["
r"a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25["
r"0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:)])",
r"\b[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9](?:["
r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|["
r"01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[A-Za-z0-9-]*[A-Za-z0-9]:)])",
email_replacement,
)

Expand Down
2 changes: 2 additions & 0 deletions tests/pipeline/test_pii_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
email@example.museum
email@example.co.jp
firstname-lastname@example.com
NAME@MYSITE.COM
Expand Down Expand Up @@ -161,6 +162,7 @@
EMAIL
EMAIL
EMAIL
EMAIL
Expand Down

0 comments on commit c72b1e4

Please sign in to comment.