Merge branch 'release/5.7.0'

CentreForDigitalHumanities · Jun 5, 2024 · 4cae955 · 4cae955
2 parents 1518820 + 4b878df
commit 4cae955
Show file tree

Hide file tree

Showing 175 changed files with 2,582 additions and 1,487 deletions.
diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml
@@ -0,0 +1,27 @@
+# This workflow will run backend tests on the Python version defined in the Dockerfiles
+
+name: Backend unit tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'develop'
+      - 'master'
+      - 'feature/**'
+      - 'bugfix/**'
+      - 'hotfix/**'
+      - 'release/**'
+      - 'dependabot/**'
+    paths-ignore:
+      - 'frontend/**'
+      - '**.md'
+
+jobs:
+  backend-test:
+    name: Test Backend
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Run backend tests
+      run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest
diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml
@@ -0,0 +1,27 @@
+# This workflow will run frontend tests on the Node version defined in the Dockerfiles
+
+name: Frontend unit tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'develop'
+      - 'master'
+      - 'feature/**'
+      - 'bugfix/**'
+      - 'hotfix/**'
+      - 'release/**'
+      - 'dependabot/**'
+    paths-ignore:
+      - 'backend/**'
+      - '**.md'
+
+jobs:
+  frontend-test:
+    name: Test Frontend
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Run frontend tests
+      run: sudo docker-compose --env-file .env-ci run frontend yarn test
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,25 @@
+# This action will update the CITATION.cff file for new release or hotfix branches
+
+name: Release
+
+on:
+  push:
+    branches:
+      - 'release/**'
+      - 'hotfix/**'
+
+jobs:
+  citation-update:
+    name: Update CITATION.cff
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Autoformat CITATION.cff
+        run: |
+          version=`grep -o '[0-9]\+\.[0-9]\+\.[0-9]\+' package.json | head -n 1`  # grep has no \d outside -P; keep first match only
+          today=`date +"%Y-%m-%d"`
+          sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff
+          sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff
+          bash ./update-citation.sh
+          git commit -a -m "update version and date in CITATION.cff"
+
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
diff --git a/CITATION.cff b/CITATION.cff
@@ -35,5 +35,5 @@ keywords:
   - elasticsearch
   - natural language processing
 license: MIT
-version: 5.6.2
-date-released: '2024-05-06'
+version: 5.7.0
+date-released: '2024-06-05'
diff --git a/DockerfileElastic b/DockerfileElastic
@@ -0,0 +1,3 @@
+FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2
+
+RUN bin/elasticsearch-plugin install mapper-annotated-text
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ For corpora included in I-analyzer, the backend includes a definition file that
 
 ## Usage
 
-If you are interested in using I-analyzer, the most straightforward way to get started is to make an account at [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament  (not publicly accessible)](https://people-and-parliament.hum.uu.nl/).
+If you are interested in using I-analyzer, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
 
 I-analyzer does not have an "upload data" option (yet!). If you are interested in using I-analyzer as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways:
 

diff --git a/backend/addcorpus/conftest.py b/backend/addcorpus/conftest.py
@@ -1,31 +1,14 @@
 import pytest
-import os
 from django.contrib.auth.models import Group
 from addcorpus.models import Corpus
 
 @pytest.fixture()
-def group_with_access(db, mock_corpus):
+def group_with_access(db, basic_mock_corpus):
     '''Create a group with access to the mock corpus'''
     group = Group.objects.create(name='nice-users')
-    corpus = Corpus.objects.get(name=mock_corpus)
+    corpus = Corpus.objects.get(name=basic_mock_corpus)
     corpus.groups.add(group)
     corpus.save()
     yield group
     group.delete()
 
-here = os.path.abspath(os.path.dirname(__file__))
-
-@pytest.fixture()
-def mock_corpus():
-    return 'mock-csv-corpus'
-
-
-@pytest.fixture()
-def basic_corpus():
-    corpus_name = 'mock-basic-corpus'
-    basic_group = Group.objects.create(name='basic')
-    corpus = Corpus.objects.get(name=corpus_name)
-    corpus.groups.add(basic_group)
-    yield corpus_name
-    corpus.groups.remove(basic_group)
-    basic_group.delete()
diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py
@@ -1,5 +1,9 @@
+from typing import Dict, Optional
 from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available
 
+def primary_mapping_type(es_mapping: Dict) -> str:
+    return es_mapping.get('type', None)
+
 def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
     '''
     Mapping for the main content field. Options:

diff --git a/backend/addcorpus/json_corpora/conftest.py b/backend/addcorpus/json_corpora/conftest.py
@@ -0,0 +1,147 @@
+import pytest
+
+@pytest.fixture()
+def content_field_json():
+    return {
+        'name': 'content',
+        'display_name': 'Content',
+        'description': 'Bla bla bla',
+        'type': 'text_content',
+        'language': 'en',
+        'options': {
+            'search': True,
+            'filter': 'none',
+            'preview': True,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'content'}
+    }
+
+@pytest.fixture()
+def keyword_field_json():
+    return {
+        'name': 'author',
+        'display_name': 'Author',
+        'description': 'Author of the text',
+        'type': 'text_metadata',
+        'options': {
+            'search': True,
+            'filter': 'show',
+            'preview': True,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'author'}
+    }
+
+@pytest.fixture()
+def int_field_json():
+    return {
+        'name': 'year',
+        'display_name': 'Year',
+        'description': 'Year in which the text was written',
+        'type': 'integer',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': False,
+            'visualize': True,
+            'sort': True,
+            'hidden': False
+        },
+        'extract': {'column': 'year'}
+    }
+
+@pytest.fixture()
+def float_field_json():
+    return {
+        'name': 'ocr_confidence',
+        'display_name': 'OCR confidence',
+        'description': 'Confidence level of optical character recognition output',
+        'type': 'float',
+        'options': {
+            'search': False,
+            'filter': 'hide',
+            'preview': False,
+            'visualize': False,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'ocr'}
+    }
+
+@pytest.fixture()
+def date_field_json():
+    return {
+        'name': 'date',
+        'display_name': 'Date',
+        'description': 'Date on which the text was written',
+        'type': 'date',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': True,
+            'visualize': True,
+            'sort': True,
+            'hidden': False
+        },
+        'extract': {'column': 'date'}
+    }
+
+@pytest.fixture()
+def boolean_field_json():
+    return {
+        'name': 'author_known',
+        'display_name': 'Author known',
+        'description': 'Whether the author of the text is known',
+        'type': 'boolean',
+        'options': {
+            'search': False,
+            'filter': 'show',
+            'preview': False,
+            'visualize': True,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'author_known'}
+    }
+
+@pytest.fixture()
+def geo_field_json():
+    return {
+        'name': 'location',
+        'display_name': 'Location',
+        'description': 'Location where the text was published',
+        'type': 'geo_point',
+        'options': {
+            'search': False,
+            'filter': 'none',
+            'preview': False,
+            'visualize': False,
+            'sort': False,
+            'hidden': False
+        },
+        'extract': {'column': 'location'}
+    }
+
+@pytest.fixture(
+    params=['content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo']
+)
+def any_field_json(
+    request, content_field_json, keyword_field_json, int_field_json, float_field_json,
+    date_field_json, boolean_field_json, geo_field_json
+):
+    field_type = request.param
+    funcs = {
+        'content': content_field_json,
+        'keyword': keyword_field_json,
+        'int': int_field_json,
+        'float': float_field_json,
+        'date': date_field_json,
+        'boolean': boolean_field_json,
+        'geo': geo_field_json,
+    }
+    return funcs[field_type]
diff --git a/backend/addcorpus/json_corpora/constants.py b/backend/addcorpus/json_corpora/constants.py
@@ -0,0 +1,2 @@
+DEFAULT_CSV_DELIMITER = ','
+DATE_FORMAT = '%Y-%m-%d'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2

		RUN bin/elasticsearch-plugin install mapper-annotated-text
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		DEFAULT_CSV_DELIMITER = ','
		DATE_FORMAT = '%Y-%m-%d'