Skip to content

Commit 532cfd6

Browse files
committed
Merge branch 'release/5.17.0'
2 parents fa0a872 + 7e65872 commit 532cfd6

File tree

98 files changed

+2072
-522
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+2072
-522
lines changed

.github/workflows/scheduled-build-and-push.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,20 @@ jobs:
2121
registry: ghcr.io
2222
username: ${{ github.actor }}
2323
password: ${{ secrets.GITHUB_TOKEN }}
24-
- name: Build frontend image, using cache from Github registry
24+
- name: Build frontend image
2525
uses: docker/build-push-action@v6
2626
with:
2727
context: frontend/.
2828
file: frontend/Dockerfile.base
2929
push: true
3030
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
31-
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
32-
cache-to: type=inline
33-
- name: Build backend image, using cache from Github registry
31+
platforms: linux/amd64,linux/arm64
32+
- name: Build backend image
3433
uses: docker/build-push-action@v6
3534
with:
3635
context: backend/.
3736
file: backend/Dockerfile.base
3837
push: true
3938
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
40-
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
41-
cache-to: type=inline
39+
platforms: linux/amd64,linux/arm64
40+

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,5 @@ keywords:
3535
- elasticsearch
3636
- natural language processing
3737
license: MIT
38-
version: 5.16.0
39-
date-released: '2025-01-22'
38+
version: 5.17.0
39+
date-released: '2025-02-13'

backend/addcorpus/exceptions.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from addcorpus.models import Corpus
2+
3+
4+
class PythonDefinitionRequired(Exception):
    '''
    Exception that can be raised when attempting to use functionality only applicable for
    Python corpora, on a corpus that does not have a Python definition.

    Parameters:
        corpus: the corpus on which the functionality was attempted.
        message: description of the problem; the string form of the exception
            appends the corpus to this message.
        *args: any further arguments, passed on to `Exception`.
    '''

    def __init__(self, corpus: 'Corpus', message: str, *args):
        self.corpus = corpus
        self.message = message
        # Forward the required arguments to Exception so that they are stored in
        # `self.args`. Pickling and copying rebuild an exception by calling its
        # class with `self.args`; leaving them out makes that call fail with a
        # TypeError because `corpus` and `message` are required.
        super().__init__(corpus, message, *args)

    def __str__(self):
        return f'{self.message} (corpus: {self.corpus})'
18+
19+
class NoPythonDefinitionAllowed(Exception):
    '''
    Exception that can be raised when attempting to use functionality only applicable for
    database-only corpora, on a corpus with a Python definition.

    Parameters:
        corpus: the corpus on which the functionality was attempted.
        message: description of the problem; the string form of the exception
            appends the corpus to this message.
        *args: any further arguments, passed on to `Exception`.
    '''

    def __init__(self, corpus: 'Corpus', message: str, *args):
        self.corpus = corpus
        self.message = message
        # Forward the required arguments to Exception so that they are stored in
        # `self.args`. Pickling and copying rebuild an exception by calling its
        # class with `self.args`; leaving them out makes that call fail with a
        # TypeError because `corpus` and `message` are required.
        super().__init__(corpus, message, *args)

    def __str__(self):
        return f'{self.message} (corpus: {self.corpus})'
33+
Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1 @@
1-
from datetime import date
2-
3-
41
DEFAULT_CSV_DELIMITER = ','
5-
DATE_FORMAT = '%Y-%m-%d'
6-
7-
DEFAULT_MIN_DATE = date(1800, 1, 1)
8-
DEFAULT_MAX_DATE = date.today()

backend/addcorpus/json_corpora/export_json.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Dict
2-
from datetime import date
32
from addcorpus.models import Corpus, CorpusConfiguration, Field
4-
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
3+
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER
54
from addcorpus.es_mappings import primary_mapping_type
65

76
def export_json_corpus(corpus: Corpus) -> Dict:
@@ -24,13 +23,11 @@ def export_corpus_meta(configuration: CorpusConfiguration) -> Dict:
2423
'description': configuration.description,
2524
'languages': configuration.languages,
2625
'date_range': {
27-
'min': export_date(configuration.min_date),
28-
'max': export_date(configuration.max_date),
26+
'min': configuration.min_year,
27+
'max': configuration.max_year,
2928
}
3029
}
3130

32-
def export_date(date: date):
33-
return date.strftime(DATE_FORMAT)
3431

3532
def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict:
3633
data = {

backend/addcorpus/json_corpora/import_json.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
from typing import List, Dict, Iterable, Optional
2-
from datetime import date, datetime
3-
1+
from typing import List, Dict, Iterable
42

53
from addcorpus.models import Field
64
from addcorpus.json_corpora.utils import get_path
75
from addcorpus import es_mappings
86
from addcorpus.constants import VisualizationType
97
from django.conf import settings
10-
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT, DEFAULT_MAX_DATE, DEFAULT_MIN_DATE
8+
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER
119

1210
def import_json_corpus(data: Dict) -> Dict:
1311
name = get_path(data, 'name')
@@ -31,10 +29,8 @@ def _parse_configuration(data: Dict) -> Dict:
3129
'category': get_path(data, 'meta', 'category'),
3230
'es_index': create_index_name(get_path(data, 'name')),
3331
'languages': get_path(data, 'meta', 'languages'),
34-
'min_date': _parse_date(
35-
get_path(data, 'meta', 'date_range', 'min'), DEFAULT_MIN_DATE),
36-
'max_date': _parse_date(
37-
get_path(data, 'meta', 'date_range', 'max'), DEFAULT_MAX_DATE),
32+
'min_year': get_path(data, 'meta', 'date_range', 'min'),
33+
'max_year': get_path(data, 'meta', 'date_range', 'max'),
3834
'default_sort': get_path(
3935
data, 'options', 'default_sort') or {},
4036
'language_field': get_path(
@@ -47,12 +43,6 @@ def _parse_configuration(data: Dict) -> Dict:
4743
}
4844

4945

50-
def _parse_date(date: Optional[str], fallback: Optional[date]):
51-
if not date:
52-
return fallback
53-
return datetime.strptime(date, DATE_FORMAT).date()
54-
55-
5646
def _import_fields(data: Dict) -> List[Dict]:
5747
fields_data = get_path(data, 'fields')
5848

backend/addcorpus/json_corpora/tests/test_import.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from datetime import date
21
from addcorpus.json_corpora.import_json import _parse_field
32
from addcorpus.models import Field, Corpus
43
from addcorpus.serializers import CorpusJSONDefinitionSerializer
54
from addcorpus.models import Corpus, CorpusConfiguration
5+
from addcorpus.json_corpora.export_json import export_json_corpus
66

77
def test_json_corpus_import(db, json_mock_corpus, json_corpus_definition):
88
json_mock_corpus.delete()
@@ -24,8 +24,8 @@ def test_json_corpus_import(db, json_mock_corpus, json_corpus_definition):
2424
assert config.description == 'Example corpus'
2525
assert config.languages == ['en']
2626
assert config.category == 'book'
27-
assert config.min_date == date(1500, 1, 1)
28-
assert config.max_date == date(1700, 12, 31)
27+
assert config.min_year == 1500
28+
assert config.max_year == 1700
2929
assert config.source_data_delimiter == ','
3030
assert config.es_index == 'test-example'
3131

@@ -76,6 +76,20 @@ def test_serializer_update(db, json_corpus_definition, json_mock_corpus: Corpus)
7676
serializer.update(json_mock_corpus, serializer.validated_data)
7777
assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 1
7878

79+
def test_serializer_update_field_order(db, json_corpus_definition, json_mock_corpus: Corpus):
    '''
    Update an existing corpus with a definition whose fields are listed in
    reverse order, and check that exporting the corpus returns the submitted
    definition.
    '''
    payload = {'definition': json_corpus_definition, 'active': True}
    # submit the fields in the opposite order
    flipped_fields = list(reversed(payload['definition']['fields']))
    payload['definition']['fields'] = flipped_fields

    serializer = CorpusJSONDefinitionSerializer(data=payload)
    assert serializer.is_valid()
    serializer.update(json_mock_corpus, serializer.validated_data)

    # reload from the database so the export reflects what was stored
    json_mock_corpus.refresh_from_db()
    exported = export_json_corpus(json_mock_corpus)
    assert exported == payload['definition']
92+
7993

8094
def test_parse_content_field(content_field_json):
8195
data = _parse_field(content_field_json)
@@ -195,4 +209,3 @@ def test_parse_geo_field(geo_field_json):
195209
assert field.hidden == False
196210
assert field.sortable == False
197211
assert field.searchable == False
198-
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import addcorpus.models
2+
from django.db import migrations, models
3+
from django.apps import AppConfig
4+
from datetime import date
5+
6+
7+
def fill_min_max_years_from_date(apps: AppConfig, schema_editor):
    '''
    Forward data migration: copy the year of `min_date` and `max_date` into
    `min_year` and `max_year` for every corpus configuration.

    Parameters:
        apps: object passed in by `RunPython`; only used to look up the
            `CorpusConfiguration` model through `get_model`.
        schema_editor: passed in by `RunPython`; not used here.
    '''
    configuration_model = apps.get_model('addcorpus', 'CorpusConfiguration')
    for configuration in configuration_model.objects.all():
        earliest = configuration.min_date
        configuration.min_year = earliest.year
        latest = configuration.max_date
        configuration.max_year = latest.year
        configuration.save()
13+
14+
15+
def fill_min_max_date_from_year(apps: AppConfig, schema_editor):
    '''
    Reverse data migration: rebuild `min_date` and `max_date` from the stored
    years for every corpus configuration. The minimum becomes 1 January of
    `min_year`, the maximum becomes 31 December of `max_year`.

    Parameters:
        apps: object passed in by `RunPython`; only used to look up the
            `CorpusConfiguration` model through `get_model`.
        schema_editor: passed in by `RunPython`; not used here.
    '''
    configuration_model = apps.get_model('addcorpus', 'CorpusConfiguration')
    for configuration in configuration_model.objects.all():
        first_day = date(year=configuration.min_year, month=1, day=1)
        configuration.min_date = first_day
        last_day = date(year=configuration.max_year, month=12, day=31)
        configuration.max_date = last_day
        configuration.save()
21+
22+
23+
class Migration(migrations.Migration):
    '''
    Replace the `min_date` / `max_date` date fields of the corpus
    configuration with integer `min_year` / `max_year` fields, converting
    the stored values in between.
    '''

    # migrations that this one stands in for
    replaces = [
        ('addcorpus', '0028_corpusconfiguration_max_year_and_more'),
        ('addcorpus', '0029_fill_minyear_maxyear'),
        ('addcorpus', '0030_remove_corpusconfiguration_max_date_and_more'),
        ('addcorpus', '0031_alter_corpusconfiguration_max_year_and_more'),
    ]

    dependencies = [
        ('addcorpus', '0027_alter_corpusconfiguration_category_and_more'),
    ]

    operations = [
        # 1. add the new year fields
        migrations.AddField(
            model_name='corpusconfiguration',
            name='max_year',
            field=models.IntegerField(
                default=addcorpus.models.default_max_year,
                help_text='latest year for the data in the corpus',
            ),
        ),
        migrations.AddField(
            model_name='corpusconfiguration',
            name='min_year',
            field=models.IntegerField(
                default=1800,
                help_text='earliest year for the data in the corpus',
            ),
            preserve_default=False,
        ),
        # 2. make the date fields nullable
        # NOTE(review): presumably needed so that reversing the RemoveField
        # operations below can re-add the columns before the reverse data
        # migration fills them - confirm.
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='min_date',
            field=models.DateField(
                null=True,
                help_text='earliest date for the data in the corpus',
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='max_date',
            field=models.DateField(
                null=True,
                help_text='latest date for the data in the corpus',
            ),
        ),
        # 3. convert the data (dates -> years; years -> dates when reversed)
        migrations.RunPython(
            code=fill_min_max_years_from_date,
            reverse_code=fill_min_max_date_from_year,
        ),
        # 4. drop the date fields
        migrations.RemoveField(
            model_name='corpusconfiguration',
            name='max_date',
        ),
        migrations.RemoveField(
            model_name='corpusconfiguration',
            name='min_date',
        ),
    ]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.17 on 2025-02-13 16:30
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    '''
    Alter `CorpusConfiguration.min_year` to an integer field with a default
    of 1800.
    '''

    dependencies = [
        ('addcorpus', '0028_corpusconfiguration_minyear_maxyear'),
    ]

    operations = [
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='min_year',
            field=models.IntegerField(
                default=1800,
                help_text='earliest year for the data in the corpus',
            ),
        ),
    ]

backend/addcorpus/models.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import warnings
3+
from datetime import datetime
34

45
from django.contrib import admin
56
from django.contrib.auth.models import Group
@@ -36,6 +37,11 @@
3637
MAX_LENGTH_NAME = 126
3738
MAX_LENGTH_DESCRIPTION = 254
3839
MAX_LENGTH_TITLE = 256
40+
DEFAULT_MIN_YEAR = 1800
41+
42+
def default_max_year() -> int:
    '''
    Return the current year, based on the local date and time at the moment
    of the call.
    '''
    current_moment = datetime.now()
    return current_moment.year
44+
3945

4046
class Corpus(models.Model):
4147
name = models.SlugField(
@@ -217,11 +223,13 @@ class CorpusConfiguration(models.Model):
217223
help_text='languages used in the content of the corpus (from most to least frequent)',
218224
blank=True,
219225
)
220-
min_date = models.DateField(
221-
help_text='earliest date for the data in the corpus',
226+
min_year = models.IntegerField(
227+
help_text='earliest year for the data in the corpus',
228+
default=DEFAULT_MIN_YEAR,
222229
)
223-
max_date = models.DateField(
224-
help_text='latest date for the data in the corpus',
230+
max_year = models.IntegerField(
231+
help_text='latest year for the data in the corpus',
232+
default=default_max_year,
225233
)
226234
scan_image_type = models.CharField(
227235
max_length=64,

backend/addcorpus/python_corpora/corpus.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
Module contains the base classes from which corpora can derive;
33
'''
44

5-
from typing import Optional, List, Dict
5+
from typing import Optional, List, Dict, Union
66
from ianalyzer_readers import extract
7-
from datetime import datetime
7+
from datetime import datetime, date
88
from os.path import isdir
99
import os
1010

@@ -49,16 +49,20 @@ def description(self):
4949
raise NotImplementedError('CorpusDefinition missing description')
5050

5151
@property
52-
def min_date(self):
52+
def min_date(self) -> Union[datetime, date, int]:
5353
'''
5454
Minimum timestamp for data files.
55+
56+
Can be a datetime, date, or integer (representing the year).
5557
'''
5658
raise NotImplementedError('CorpusDefinition missing min_date')
5759

5860
@property
59-
def max_date(self):
61+
def max_date(self) -> Union[datetime, date, int]:
6062
'''
6163
Maximum timestamp for data files.
64+
65+
Can be a datetime, date, or integer (representing the year).
6266
'''
6367
raise NotImplementedError('CorpusDefinition missing max_date')
6468

@@ -512,16 +516,26 @@ def f(metadata):
512516
return f
513517

514518

515-
def consolidate_start_end_years(start, end, min_date, max_date):
519+
def consolidate_start_end_years(
520+
start: Union[datetime, date, int],
521+
end: Union[datetime, date, int],
522+
min_date: datetime,
523+
max_date: datetime
524+
):
516525
''' given a start and end date provided by the user, make sure
517526
- that start is not before end
518527
- that start is not before min_date (corpus variable)
519528
- that end is not after max_date (corpus variable)
520529
'''
521530
if isinstance(start, int):
522531
start = datetime(year=start, month=1, day=1)
532+
elif isinstance(start, date):
533+
start = datetime(year=start.year, month=start.month, day=start.day)
523534
if isinstance(end, int):
524535
end = datetime(year=end, month=12, day=31)
536+
elif isinstance(end, date):
537+
end = datetime(year=end.year, month=end.month, day=end.day)
538+
525539
if start > end:
526540
tmp = start
527541
start = end

backend/addcorpus/python_corpora/filters.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
passed through to ElasticSearch.
44
'''
55

6-
from datetime import datetime
6+
from datetime import datetime, date
77
from addcorpus.constants import MappingType
88

99
class Filter(object):
@@ -24,7 +24,7 @@ def serialize(self):
2424
search_dict = {'name': name}
2525
for key, value in self.__dict__.items():
2626
if key == 'search_filter' or key != 'field':
27-
if type(value) == datetime:
27+
if isinstance(value, datetime) or isinstance(value, date):
2828
search_dict[key] = value.isoformat()
2929
else:
3030
search_dict[key] = value

0 commit comments

Comments
 (0)