From a78782ea931c9f2520caffc8aad88c7d997625b4 Mon Sep 17 00:00:00 2001 From: fazeelghafoor Date: Thu, 1 Aug 2024 23:27:16 +0500 Subject: [PATCH 1/7] handle MultipleObjectsReturned for duplicates after TAGGIT_CASE_INSENSITIVE settings is set to True --- taggit/managers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/taggit/managers.py b/taggit/managers.py index 0ff8e976..377b27c5 100644 --- a/taggit/managers.py +++ b/taggit/managers.py @@ -4,6 +4,7 @@ from django.conf import settings from django.contrib.contenttypes.fields import GenericRelation from django.contrib.contenttypes.models import ContentType +from django.core.exceptions import MultipleObjectsReturned from django.db import connections, models, router from django.db.models import signals from django.db.models.fields.related import ( @@ -242,6 +243,9 @@ def _to_tag_model_instances(self, tags, tag_kwargs): existing_tags_for_str[name] = tag except self.through.tag_model().DoesNotExist: tags_to_create.append(name) + except MultipleObjectsReturned: + tag = manager.filter(name__iexact=name, **tag_kwargs).first() + existing_tags_for_str[name] = tag else: # Django is smart enough to not actually query if tag_strs is empty # but importantly, this is a single query for all potential tags From e0ebafb30ce683c573e94645f3bf9320db4465c0 Mon Sep 17 00:00:00 2001 From: fazeelghafoor Date: Thu, 1 Aug 2024 23:27:43 +0500 Subject: [PATCH 2/7] add managment command deduplicate_tags for removing case_insensitive duplicate tags --- .../management/commands/deduplicate_tags.py | 57 ++++++++++++++++ tests/test_deduplicate_tags.py | 66 +++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 taggit/management/commands/deduplicate_tags.py create mode 100644 tests/test_deduplicate_tags.py diff --git a/taggit/management/commands/deduplicate_tags.py b/taggit/management/commands/deduplicate_tags.py new file mode 100644 index 00000000..0644bac8 --- /dev/null +++ b/taggit/management/commands/deduplicate_tags.py @@ -0,0 +1,57 @@ +from collections import defaultdict +from django.core.management.base import BaseCommand +from django.conf import settings +from django.db import transaction +from taggit.models import Tag, TaggedItem + + +class Command(BaseCommand): + help = "Identify and remove duplicate tags based on case insensitivity" + + def handle(self, *args, **kwargs): + if not getattr(settings, "TAGGIT_CASE_INSENSITIVE", False): + self.stdout.write( + self.style.ERROR("TAGGIT_CASE_INSENSITIVE is not enabled.") + ) + return + + tags = Tag.objects.all() + tag_dict = {} + tagged_items_to_update = defaultdict(list) + + for tag in tags: + lower_name = tag.name.lower() + if lower_name in tag_dict: + existing_tag = tag_dict[lower_name] + self._collect_tagged_items(tag, existing_tag, tagged_items_to_update) + tag.delete() + else: + tag_dict[lower_name] = tag + + self._remove_duplicates_and_update(tagged_items_to_update) + self.stdout.write(self.style.SUCCESS("Tag deduplication complete.")) + + def _collect_tagged_items(self, tag, existing_tag, tagged_items_to_update): + for item in TaggedItem.objects.filter(tag=tag): + tagged_items_to_update[(item.content_type_id, item.object_id)].append( + existing_tag.id + ) + + def _remove_duplicates_and_update(self, tagged_items_to_update): + with transaction.atomic(): + for (content_type_id, object_id), tag_ids in tagged_items_to_update.items(): + unique_tag_ids = set(tag_ids) + if len(unique_tag_ids) > 1: + first_tag_id = unique_tag_ids.pop() + for duplicate_tag_id in unique_tag_ids: + TaggedItem.objects.filter( + content_type_id=content_type_id, + object_id=object_id, + tag_id=duplicate_tag_id, + ).delete() + + TaggedItem.objects.filter( + content_type_id=content_type_id, + object_id=object_id, + tag_id=first_tag_id, + ).update(tag_id=first_tag_id) diff --git a/tests/test_deduplicate_tags.py b/tests/test_deduplicate_tags.py new file mode 100644 index 00000000..64d7e9bb --- /dev/null +++ b/tests/test_deduplicate_tags.py @@ -0,0 +1,66 @@ +from io import StringIO +from django.core.management import call_command +from django.test import TestCase +from django.conf import settings +from taggit.models import Tag, TaggedItem +from tests.models import Food, HousePet + + +class DeduplicateTagsTests(TestCase): + def setUp(self): + settings.TAGGIT_CASE_INSENSITIVE = True + + self.tag1 = Tag.objects.create(name="Python") + self.tag2 = Tag.objects.create(name="python") + self.tag3 = Tag.objects.create(name="PYTHON") + + self.food_item = Food.objects.create(name="Apple") + self.pet_item = HousePet.objects.create(name="Fido") + + self.food_item.tags.add(self.tag1) + self.pet_item.tags.add(self.tag2) + + def test_deduplicate_tags(self): + self.assertEqual(Tag.objects.count(), 3) + self.assertEqual(TaggedItem.objects.count(), 2) + + out = StringIO() + call_command("deduplicate_tags", stdout=out) + + self.assertEqual(Tag.objects.count(), 1) + self.assertEqual(TaggedItem.objects.count(), 2) + + self.assertTrue(Tag.objects.filter(name__iexact="python").exists()) + self.assertEqual( + TaggedItem.objects.filter(tag__name__iexact="python").count(), 2 + ) + + self.assertIn("Tag deduplication complete.", out.getvalue()) + + def test_no_duplicates(self): + self.assertEqual(Tag.objects.count(), 3) + self.assertEqual(TaggedItem.objects.count(), 2) + + out = StringIO() + call_command("deduplicate_tags", stdout=out) + + self.assertEqual(Tag.objects.count(), 1) + self.assertEqual(TaggedItem.objects.count(), 2) + + self.assertTrue(Tag.objects.filter(name__iexact="python").exists()) + self.assertEqual( + TaggedItem.objects.filter(tag__name__iexact="python").count(), 2 + ) + + self.assertIn("Tag deduplication complete.", out.getvalue()) + + def test_taggit_case_insensitive_not_enabled(self): + settings.TAGGIT_CASE_INSENSITIVE = False + + out = StringIO() + call_command("deduplicate_tags", stdout=out) + + self.assertIn("TAGGIT_CASE_INSENSITIVE is not enabled.", out.getvalue()) + + self.assertEqual(Tag.objects.count(), 3) + self.assertEqual(TaggedItem.objects.count(), 2) From 7d49beef87bc70ce4d66183af2e8f3d8bca8ab23 Mon Sep 17 00:00:00 2001 From: fazeelghafoor Date: Thu, 1 Aug 2024 23:27:53 +0500 Subject: [PATCH 3/7] update changelog --- CHANGELOG.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cd7b6cd0..36d2c1dc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,9 @@ Changelog (Unreleased) ~~~~~~~~~~~~ * Added a management command (``remove_orphaned_tags``) to remove orphaned tags +* Fixed MultipleObjectsReturned error when setting tags with case-insensitive names by returning first tag aftre ``TAGGIT_CASE_INSENSITIVE`` is set to ``True``. +* Added ``deduplicate_tags`` management command to remove duplicate tags based on case insensitivity. This feature is enabled when ``TAGGIT_CASE_INSENSITIVE`` is set to ``True`` in the settings. + 6.0.0 (2024-07-27) ~~~~~~~~~~~~~~~~~~ From 150c025998cee9b0a3696fb36bdd080361110bb0 Mon Sep 17 00:00:00 2001 From: Raphael Gaschignard Date: Sat, 28 Sep 2024 16:05:08 +1000 Subject: [PATCH 4/7] Make sure ordering is always by pk for handling multiples --- CHANGELOG.rst | 6 +++--- taggit/managers.py | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 36d2c1dc..5fa99d08 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,9 +3,9 @@ Changelog (Unreleased) ~~~~~~~~~~~~ -* Added a management command (``remove_orphaned_tags``) to remove orphaned tags -* Fixed MultipleObjectsReturned error when setting tags with case-insensitive names by returning first tag aftre ``TAGGIT_CASE_INSENSITIVE`` is set to ``True``. -* Added ``deduplicate_tags`` management command to remove duplicate tags based on case insensitivity. This feature is enabled when ``TAGGIT_CASE_INSENSITIVE`` is set to ``True`` in the settings. +* Add a management command (``remove_orphaned_tags``) to remove orphaned tags +* Add a fallback for when multiple tags are found in case-insensitivity mode (the earliest by PK is returned) +* Add a ``deduplicate_tags`` management command to remove duplicate tags based on case insensitivity. This feature is enabled when ``TAGGIT_CASE_INSENSITIVE`` is set to ``True`` in the settings. 6.0.0 (2024-07-27) diff --git a/taggit/managers.py b/taggit/managers.py index 377b27c5..cf8e3407 100644 --- a/taggit/managers.py +++ b/taggit/managers.py @@ -244,7 +244,11 @@ def _to_tag_model_instances(self, tags, tag_kwargs): except self.through.tag_model().DoesNotExist: tags_to_create.append(name) except MultipleObjectsReturned: - tag = manager.filter(name__iexact=name, **tag_kwargs).first() + tag = ( + manager.filter(name__iexact=name, **tag_kwargs) + .order_by("pk") + .first() + ) existing_tags_for_str[name] = tag else: # Django is smart enough to not actually query if tag_strs is empty From 0e109630f350eeffa8c37eddcd9e9c31e13b2cf2 Mon Sep 17 00:00:00 2001 From: Raphael Gaschignard Date: Sat, 28 Sep 2024 16:11:23 +1000 Subject: [PATCH 5/7] sort imports --- taggit/management/commands/deduplicate_tags.py | 4 +++- tests/test_deduplicate_tags.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/taggit/management/commands/deduplicate_tags.py b/taggit/management/commands/deduplicate_tags.py index 0644bac8..647d7873 100644 --- a/taggit/management/commands/deduplicate_tags.py +++ b/taggit/management/commands/deduplicate_tags.py @@ -1,7 +1,9 @@ from collections import defaultdict -from django.core.management.base import BaseCommand + from django.conf import settings +from django.core.management.base import BaseCommand from django.db import transaction + from taggit.models import Tag, TaggedItem diff --git a/tests/test_deduplicate_tags.py b/tests/test_deduplicate_tags.py index 64d7e9bb..b0f63feb 100644 --- a/tests/test_deduplicate_tags.py +++ b/tests/test_deduplicate_tags.py @@ -1,7 +1,9 @@ from io import StringIO + +from django.conf import settings from django.core.management import call_command from django.test import TestCase -from django.conf import settings + from taggit.models import Tag, TaggedItem from tests.models import Food, HousePet From e851a84be4f5c3231d06a96750577820f9d5ae2b Mon Sep 17 00:00:00 2001 From: Raphael Gaschignard Date: Sat, 28 Sep 2024 16:33:13 +1000 Subject: [PATCH 6/7] Simplify deduplication management command --- .../management/commands/deduplicate_tags.py | 34 ++++++++++++++++--- tests/test_deduplicate_tags.py | 7 ++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/taggit/management/commands/deduplicate_tags.py b/taggit/management/commands/deduplicate_tags.py index 647d7873..2f1f512a 100644 --- a/taggit/management/commands/deduplicate_tags.py +++ b/taggit/management/commands/deduplicate_tags.py @@ -19,20 +19,46 @@ def handle(self, *args, **kwargs): tags = Tag.objects.all() tag_dict = {} - tagged_items_to_update = defaultdict(list) for tag in tags: lower_name = tag.name.lower() if lower_name in tag_dict: existing_tag = tag_dict[lower_name] - self._collect_tagged_items(tag, existing_tag, tagged_items_to_update) - tag.delete() + self._deduplicate_tags(existing_tag=existing_tag, tag_to_remove=tag) else: tag_dict[lower_name] = tag - self._remove_duplicates_and_update(tagged_items_to_update) self.stdout.write(self.style.SUCCESS("Tag deduplication complete.")) + @transaction.atomic + def _deduplicate_tags(self, existing_tag, tag_to_remove): + """ + Remove a tag by merging it into an existing tag + """ + # If this ends up very slow for you, please file a ticket! + # This isn't trying to be performant, in order to keep the code simple. + for item in TaggedItem.objects.filter(tag=tag_to_remove): + # if we already have the same association on the model + # (via the existing tag), then we can just remove the + # tagged item. + tag_exists_other = TaggedItem.objects.filter( + tag=existing_tag, + content_type_id=item.content_type_id, + object_id=item.object_id, + ).exists() + if tag_exists_other: + item.delete() + else: + item.tag = existing_tag + item.save() + + # this should never trigger, but can never be too sure + assert not TaggedItem.objects.filter( + tag=tag_to_remove + ).exists(), "Tags were not all cleaned up!" + + tag_to_remove.delete() + def _collect_tagged_items(self, tag, existing_tag, tagged_items_to_update): for item in TaggedItem.objects.filter(tag=tag): tagged_items_to_update[(item.content_type_id, item.object_id)].append( diff --git a/tests/test_deduplicate_tags.py b/tests/test_deduplicate_tags.py index b0f63feb..b7aa782f 100644 --- a/tests/test_deduplicate_tags.py +++ b/tests/test_deduplicate_tags.py @@ -21,10 +21,11 @@ def setUp(self): self.food_item.tags.add(self.tag1) self.pet_item.tags.add(self.tag2) + self.pet_item.tags.add(self.tag3) def test_deduplicate_tags(self): self.assertEqual(Tag.objects.count(), 3) - self.assertEqual(TaggedItem.objects.count(), 2) + self.assertEqual(TaggedItem.objects.count(), 3) out = StringIO() call_command("deduplicate_tags", stdout=out) @@ -41,7 +42,7 @@ def test_deduplicate_tags(self): def test_no_duplicates(self): self.assertEqual(Tag.objects.count(), 3) - self.assertEqual(TaggedItem.objects.count(), 2) + self.assertEqual(TaggedItem.objects.count(), 3) out = StringIO() call_command("deduplicate_tags", stdout=out) @@ -65,4 +66,4 @@ def test_taggit_case_insensitive_not_enabled(self): self.assertIn("TAGGIT_CASE_INSENSITIVE is not enabled.", out.getvalue()) self.assertEqual(Tag.objects.count(), 3) - self.assertEqual(TaggedItem.objects.count(), 2) + self.assertEqual(TaggedItem.objects.count(), 3) From f4effb2c67e33e3e5309c578cb7926448fa4dae9 Mon Sep 17 00:00:00 2001 From: Raphael Gaschignard Date: Sat, 28 Sep 2024 16:39:18 +1000 Subject: [PATCH 7/7] Remove unused import --- taggit/management/commands/deduplicate_tags.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/taggit/management/commands/deduplicate_tags.py b/taggit/management/commands/deduplicate_tags.py index 2f1f512a..a745634b 100644 --- a/taggit/management/commands/deduplicate_tags.py +++ b/taggit/management/commands/deduplicate_tags.py @@ -1,5 +1,3 @@ -from collections import defaultdict - from django.conf import settings from django.core.management.base import BaseCommand from django.db import transaction