Skip to content

Commit fa0a872

Browse files
committed
Merge branch 'release/5.16.0'
2 parents ffc81e5 + f63bea7 commit fa0a872

File tree

82 files changed

+1918
-499
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+1918
-499
lines changed

.github/workflows/scheduled-build-and-push.yml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,3 @@ jobs:
3939
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
4040
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
4141
cache-to: type=inline
42-
- name: Build Elasticsearch image, using cache from Github registry
43-
uses: docker/build-push-action@v6
44-
with:
45-
context: .
46-
file: DockerfileElastic
47-
push: true
48-
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
49-
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
50-
cache-to: type=inline

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,5 @@ keywords:
3535
- elasticsearch
3636
- natural language processing
3737
license: MIT
38-
version: 5.15.0
39-
date-released: '2024-12-09'
38+
version: 5.16.0
39+
date-released: '2025-01-22'

DockerfileElastic

Lines changed: 0 additions & 3 deletions
This file was deleted.

backend/addcorpus/admin.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from django.contrib import admin, messages
2-
from .models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
2+
from .models import Corpus, CorpusConfiguration, CorpusDataFile, Field, CorpusDocumentationPage
33

44
def show_warning_message(request):
55
'''
@@ -14,6 +14,12 @@ def show_warning_message(request):
1414
)
1515

1616

17+
class InlineDatafileAdmin(admin.StackedInline):
    '''
    Stacked inline for the `CorpusDataFile` objects of a corpus.

    Only the uploaded file and the `is_sample` flag are editable here.
    '''
    model = CorpusDataFile
    fields = ['file', 'is_sample']
    # Must be a plain boolean: a trailing comma here would turn the value
    # into the one-element tuple `(True,)`.
    show_change_link = True
    # do not render empty extra forms
    extra = 0
22+
1723
class CorpusAdmin(admin.ModelAdmin):
1824
readonly_fields = [
1925
'configuration', 'ready_to_index', 'ready_to_publish', 'date_created',
@@ -24,13 +30,15 @@ class CorpusAdmin(admin.ModelAdmin):
2430
]
2531
list_display = ['name', 'active']
2632
list_filter = ['groups', 'active']
33+
inlines = [InlineDatafileAdmin]
2734

2835
class InlineFieldAdmin(admin.StackedInline):
    '''
    Stacked inline for `Field` objects.

    Only the display name and description of a field are editable here.
    '''
    model = Field
    # do not render empty extra forms
    extra = 0
    show_change_link = True
    fields = ['display_name', 'description']
3340

41+
3442
class CorpusConfigurationAdmin(admin.ModelAdmin):
3543
readonly_fields = ['corpus']
3644

backend/addcorpus/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ class MappingType(Enum):
2626
FLOAT = 'float'
2727
BOOLEAN = 'boolean'
2828
GEO_POINT = 'geo_point'
29-
ANNOTATED_TEXT = 'annotated_text'
3029

3130

3231
class VisualizationType(Enum):

backend/addcorpus/es_mappings.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,12 @@ def float_mapping():
9292
'type': 'float'
9393
}
9494

95-
9695
def bool_mapping():
    '''
    Return a new mapping dict for a field of type `boolean`.
    '''
    return dict(type='boolean')
9897

9998
def geo_mapping():
    '''
    Return a new mapping dict for a field of type `geo_point`.
    '''
    return dict(type='geo_point')
101100

102101

103-
def annotated_text_mapping():
104-
return {'type': 'annotated_text'}
102+
def non_indexed_text_mapping():
    '''
    Return a new mapping dict for a `text` field with `index` set to False.
    '''
    mapping = {'type': 'text'}
    mapping['index'] = False
    return mapping
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,8 @@
1+
from datetime import date
2+
3+
14
# Delimiter assumed for CSV data when none is specified.
DEFAULT_CSV_DELIMITER = ','

# strftime/strptime pattern for dates (ISO style: year-month-day).
DATE_FORMAT = '%Y-%m-%d'

# Fallback date range, for use when no explicit range is provided.
DEFAULT_MIN_DATE = date(year=1800, month=1, day=1)
# NOTE: evaluated once, when this module is imported; a long-running process
# keeps the date on which it started.
DEFAULT_MAX_DATE = date.today()

backend/addcorpus/json_corpora/import_json.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from typing import List, Dict, Iterable, Optional
2-
from datetime import datetime
2+
from datetime import date, datetime
33

44

5-
from addcorpus.models import Corpus, CorpusConfiguration, Field
5+
from addcorpus.models import Field
66
from addcorpus.json_corpora.utils import get_path
77
from addcorpus import es_mappings
88
from addcorpus.constants import VisualizationType
9-
from addcorpus.validation.publishing import _any_date_fields
109
from django.conf import settings
11-
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
10+
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT, DEFAULT_MAX_DATE, DEFAULT_MIN_DATE
1211

1312
def import_json_corpus(data: Dict) -> Dict:
1413
name = get_path(data, 'name')
@@ -33,9 +32,9 @@ def _parse_configuration(data: Dict) -> Dict:
3332
'es_index': create_index_name(get_path(data, 'name')),
3433
'languages': get_path(data, 'meta', 'languages'),
3534
'min_date': _parse_date(
36-
get_path(data, 'meta', 'date_range', 'min')),
35+
get_path(data, 'meta', 'date_range', 'min'), DEFAULT_MIN_DATE),
3736
'max_date': _parse_date(
38-
get_path(data, 'meta', 'date_range', 'max')),
37+
get_path(data, 'meta', 'date_range', 'max'), DEFAULT_MAX_DATE),
3938
'default_sort': get_path(
4039
data, 'options', 'default_sort') or {},
4140
'language_field': get_path(
@@ -48,7 +47,9 @@ def _parse_configuration(data: Dict) -> Dict:
4847
}
4948

5049

51-
def _parse_date(date: str):
50+
def _parse_date(date: Optional[str], fallback: Optional[date]):
    '''
    Parse a date string formatted according to `DATE_FORMAT`.

    Returns `fallback` when `date` is empty or None; otherwise returns the
    parsed value as a `datetime.date`.
    '''
    if date:
        parsed = datetime.strptime(date, DATE_FORMAT)
        return parsed.date()
    return fallback
5354

5455

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Generated by Django 4.2.17 on 2024-12-10 13:25
2+
3+
import addcorpus.models
4+
import addcorpus.validation.creation
5+
import django.contrib.postgres.fields
6+
from django.db import migrations, models
7+
import django.db.models.deletion
8+
9+
10+
class Migration(migrations.Migration):

    dependencies = [
        ('addcorpus', '0026_corpus_date_created'),
    ]

    operations = [
        # CorpusConfiguration.category: blank and null are allowed
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='category',
            field=models.CharField(
                blank=True,
                choices=[
                    ('parliament', 'Parliamentary debates'),
                    ('periodical', 'Newspapers and other periodicals'),
                    ('finance', 'Financial reports'),
                    ('ruling', 'Court rulings'),
                    ('review', 'Online reviews'),
                    ('inscription', 'Funerary inscriptions'),
                    ('oration', 'Orations'),
                    ('book', 'Books'),
                    ('informative', 'Informative'),
                ],
                help_text='category/medium of documents in this dataset',
                max_length=64,
                null=True,
            ),
        ),
        # CorpusConfiguration.description: blank and null are allowed
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='description',
            field=models.CharField(
                blank=True,
                help_text='short description of the corpus',
                max_length=254,
                null=True,
            ),
        ),
        # CorpusConfiguration.languages: the array itself may be blank
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='languages',
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(
                    blank=True,
                    max_length=8,
                    validators=[
                        addcorpus.validation.creation.validate_language_code
                    ],
                ),
                blank=True,
                help_text='languages used in the content of the corpus (from most to least frequent)',
                size=None,
            ),
        ),
        # new model: data files that belong to a corpus
        migrations.CreateModel(
            name='CorpusDataFile',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'file',
                    models.FileField(
                        help_text='file containing corpus data',
                        upload_to=addcorpus.models.CorpusDataFile.upload_path,
                    ),
                ),
                (
                    'is_sample',
                    models.BooleanField(
                        default=False,
                        help_text='This file is used in creating the corpus definition, it may additonaly reflect (part of) the actual data.',
                    ),
                ),
                (
                    'created',
                    models.DateTimeField(auto_now_add=True),
                ),
                (
                    'corpus',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='addcorpus.corpus',
                    ),
                ),
            ],
        ),
    ]

backend/addcorpus/models.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import warnings
23

34
from django.contrib import admin
@@ -178,10 +179,13 @@ class CorpusConfiguration(models.Model):
178179
max_length=64,
179180
choices=CATEGORIES,
180181
help_text='category/medium of documents in this dataset',
182+
blank=True,
183+
null=True
181184
)
182185
description = models.CharField(
183186
max_length=MAX_LENGTH_DESCRIPTION,
184187
blank=True,
188+
null=True,
185189
help_text='short description of the corpus',
186190
)
187191
document_context = models.JSONField(
@@ -211,6 +215,7 @@ class CorpusConfiguration(models.Model):
211215
blank=True,
212216
),
213217
help_text='languages used in the content of the corpus (from most to least frequent)',
218+
blank=True,
214219
)
215220
min_date = models.DateField(
216221
help_text='earliest date for the data in the corpus',
@@ -514,3 +519,18 @@ class Meta:
514519
name='unique_documentation_type_for_corpus'
515520
)
516521
]
522+
523+
524+
class CorpusDataFile(models.Model):
    '''
    A data file that belongs to a corpus.

    A file can be flagged as a sample (`is_sample`); the creation time is
    recorded automatically.
    '''

    def upload_path(self, filename):
        '''
        Build the upload location for `filename`: a directory named after the
        primary key of the corpus, inside `corpus_datafiles`.
        '''
        corpus_dir = str(self.corpus.pk)
        return os.path.join('corpus_datafiles', corpus_dir, filename)

    # deleting the corpus also deletes its data files
    corpus = models.ForeignKey(to=Corpus, on_delete=models.CASCADE)
    # `upload_path` has to be defined above this line to be usable here
    file = models.FileField(
        upload_to=upload_path,
        help_text='file containing corpus data',
    )
    is_sample = models.BooleanField(
        default=False,
        help_text='This file is used in creating the corpus definition, it may additonaly reflect (part of) the actual data.',
    )
    created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        '''Represent the data file by the name of its file.'''
        return str(self.file.name)

backend/addcorpus/permissions.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from rest_framework import permissions
22
from rest_framework.exceptions import NotFound
33
from rest_framework.request import Request
4-
from addcorpus.models import Corpus
4+
from addcorpus.models import Corpus, CorpusConfiguration
55

6-
def corpus_name_from_request(request):
6+
7+
def corpus_name_from_request(request: Request):
78
'''
89
Extract the corpus name from a request
910
'''
@@ -24,6 +25,11 @@ def corpus_name_from_request(request):
2425
return corpus
2526

2627

28+
def corpus_config_from_request(request: Request) -> CorpusConfiguration:
    '''
    Look up the configuration of the corpus named in a request

    The name is extracted with `corpus_name_from_request`.
    '''
    return CorpusConfiguration.objects.get(
        corpus__name=corpus_name_from_request(request)
    )
31+
32+
2733
class CanSearchCorpus(permissions.BasePermission):
2834
message = 'You do not have permission to access this corpus'
2935

backend/addcorpus/serializers.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1-
from rest_framework import serializers
21
from typing import Dict
32

4-
from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
53
from addcorpus.constants import CATEGORIES
6-
from langcodes import Language, standardize_tag
74
from addcorpus.documentation import render_documentation_context
85
from addcorpus.json_corpora.export_json import export_json_corpus
96
from addcorpus.json_corpora.import_json import import_json_corpus
7+
from addcorpus.models import (Corpus, CorpusConfiguration, CorpusDataFile,
8+
CorpusDocumentationPage, Field)
9+
from django.core.files import File
10+
from langcodes import Language, standardize_tag
11+
from rest_framework import serializers
1012

1113

1214
class NonEmptyJSONField(serializers.JSONField):
@@ -197,9 +199,12 @@ def update(self, instance: Corpus, validated_data: Dict):
197199
configuration.save()
198200

199201
for field_data in fields_data:
200-
field, _ = Field.objects.get_or_create(
201-
corpus_configuration=configuration, name=field_data['name']
202-
)
202+
try:
203+
field = Field.objects.get(
204+
corpus_configuration=configuration, name=field_data['name'])
205+
except Field.DoesNotExist:
206+
field = Field(corpus_configuration=configuration,
207+
name=field_data['name'])
203208
for attr in field_data:
204209
setattr(field, attr, field_data[attr])
205210
field.save()
@@ -211,3 +216,19 @@ def update(self, instance: Corpus, validated_data: Dict):
211216
corpus.save()
212217

213218
return corpus
219+
220+
221+
class DataFileField(serializers.FileField):
    '''
    File field that is represented by the name of the file.

    Only the output side is customised; parsing of incoming data is
    inherited unchanged from `serializers.FileField`.
    '''

    def to_representation(self, value: File) -> str:
        '''
        Return the name of `value` rather than the file object itself.
        '''
        return value.name
227+
228+
229+
class CorpusDataFileSerializer(serializers.ModelSerializer):
    '''
    Serializer for `CorpusDataFile` objects.

    The `file` attribute is handled by `DataFileField`.
    '''
    file = DataFileField()

    class Meta:
        model = CorpusDataFile
        fields = (
            'id',
            'corpus',
            'file',
            'created',
            'is_sample',
        )
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
character,line,date-column,FLOAT COLUMN,int_column,bool column
2+
"HAMLET","Whither wilt thou lead me? Speak, I'll go no further.","1256-10-08","7.03","13","True"
3+
"GHOST","Mark me.","1435-10-07","4.74","89","True"
4+
"HAMLET","I will.","1463-07-16","5.55","29","True"
5+
"GHOST","My hour is almost come,","1634-08-09","3.04","100","True"
6+
"GHOST","When I to sulph'rous and tormenting flames","1982-09-01","6.73","34","True"
7+
"GHOST","Must render up myself.","1756-11-22","-0.58","-12","False"
8+
"HAMLET","Alas, poor ghost!","1200-09-05","9.38","6","False"
9+
"GHOST","Pity me not, but lend thy serious hearing","1633-11-18","8.84","83","False"
10+
"GHOST","To what I shall unfold.","1445-11-09","3.6","97","False"
11+
"HAMLET","Speak, I am bound to hear.","1984-08-12","-1.89","-4","False"
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import os
2+
3+
from rest_framework.status import HTTP_200_OK, HTTP_201_CREATED
4+
5+
here = os.path.dirname(os.path.abspath(__file__))
6+
7+
8+
def test_csv_upload(admin_client, json_mock_corpus):
    '''
    Upload the example CSV file as a sample data file for the mock corpus,
    then check what the info endpoint reports for each column.
    '''
    csv_path = os.path.join(here, 'files', 'example.csv')

    # Test file upload
    with open(csv_path, 'rb') as csv_file:
        upload_res = admin_client.post(
            '/api/corpus/datafiles/',
            {
                'corpus': json_mock_corpus.pk,
                'is_sample': True,
                'file': csv_file,
            },
        )
    assert upload_res.status_code == HTTP_201_CREATED
    file_pk = upload_res.data.get('id')

    # Test file info
    expected_info = {
        'character': 'text',
        'line': 'text',
        'date-column': 'date',
        'FLOAT COLUMN': 'float',
        'int_column': 'integer',
        'bool column': 'boolean'
    }
    info_res = admin_client.get(f'/api/corpus/datafiles/{file_pk}/info/')
    assert info_res.status_code == HTTP_200_OK
    assert info_res.data == expected_info

backend/addcorpus/tests/test_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from addcorpus.utils import is_date, is_date_col
2+
import pandas as pd
3+
4+
5+
def test_is_date():
    '''
    `is_date` accepts a year-month-day string and rejects None, a
    non-string value and a day-month-year string.
    '''
    assert is_date('2024-01-01')

    for rejected in (None, 5, '01-01-2024'):
        assert not is_date(rejected)
10+
11+
12+
def test_is_date_col():
    '''
    `is_date_col` accepts a series of date strings, also when it contains
    missing or empty values, and rejects a series with only missing values.
    '''
    valid_dates = pd.Series(['1800-01-01', '2024-01-01'])
    blanks = pd.Series([None, ''])
    dates_with_blanks = pd.concat([valid_dates, blanks])
    only_missing = pd.Series([None, None])

    assert is_date_col(valid_dates)
    assert is_date_col(dates_with_blanks)
    assert not is_date_col(only_missing)

backend/addcorpus/tests/test_validators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pytest
22
from addcorpus.models import Field
33
from addcorpus.es_mappings import (
4-
annotated_text_mapping,
4+
non_indexed_text_mapping,
55
date_mapping,
66
int_mapping,
77
text_mapping,
@@ -29,7 +29,7 @@ def test_validate_ner_slug():
2929
validate_ner_slug({}, "some:ner_inslug")
3030
with pytest.raises(ValidationError):
3131
validate_ner_slug(keyword_mapping(), "slug:ner")
32-
validate_ner_slug(annotated_text_mapping(), "slug:ner")
32+
validate_ner_slug(non_indexed_text_mapping(), "slug:ner")
3333
with pytest.raises(ValidationError):
3434
validate_ner_slug(date_mapping(), "slug:ner-kw")
3535
validate_ner_slug(keyword_mapping(), "slug:ner-kw")

0 commit comments

Comments
 (0)