From cdbb8305886cfee0c644daaa1e4786f75bfa3dc9 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 15:44:50 +0200 Subject: [PATCH 01/94] add class for aggregation --- .../multiple-choice-filter.component.ts | 3 +- frontend/src/app/models/aggregation.ts | 38 +++++++++++++++++++ frontend/src/app/models/elasticsearch.ts | 5 --- .../services/elastic-search.service.spec.ts | 8 ++-- .../app/services/elastic-search.service.ts | 20 ++-------- frontend/src/app/services/search.service.ts | 3 +- .../barchart/histogram.component.ts | 8 ++-- frontend/src/mock-data/search.ts | 5 ++- 8 files changed, 56 insertions(+), 34 deletions(-) create mode 100644 frontend/src/app/models/aggregation.ts diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts index 8bea15441..3eb1b8396 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts @@ -5,6 +5,7 @@ import * as _ from 'lodash'; import { BaseFilterComponent } from '../base-filter.component'; import { MultipleChoiceFilter, MultipleChoiceFilterOptions } from '../../models'; import { SearchService } from '../../services'; +import { TermsAggregator } from '../../models/aggregation'; @Component({ selector: 'ia-multiple-choice-filter', @@ -29,7 +30,7 @@ export class MultipleChoiceFilterComponent extends BaseFilterComponent { if (this.filter && this.queryModel) { const optionCount = (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count; - const aggregator = { name: this.filter.corpusField.name, size: optionCount }; + const aggregator = new TermsAggregator(this.filter.corpusField, optionCount); const queryModel = this.queryModel.clone(); queryModel.filterForField(this.filter.corpusField).deactivate(); 
this.searchService.aggregateSearch(queryModel.corpus, queryModel, [aggregator]).then( diff --git a/frontend/src/app/models/aggregation.ts b/frontend/src/app/models/aggregation.ts new file mode 100644 index 000000000..cd10eaead --- /dev/null +++ b/frontend/src/app/models/aggregation.ts @@ -0,0 +1,38 @@ +import { CorpusField } from './corpus'; + +interface EsTermsAggregator { + terms: { + field: string; + size?: number; + min_doc_count?: number; + }; +} + +export type EsAggregator = EsTermsAggregator; + +export abstract class Aggregator { + name: string; + + abstract toEsAggregator(): EsAggregator; +} + +export class TermsAggregator extends Aggregator { + constructor( + private field: CorpusField, + private maxSize?: number, + private minDocCount?: number + ) { + super(); + this.name = field.name; + } + + toEsAggregator(): EsTermsAggregator { + return { + terms: { + field: this.field.name, + size: this.maxSize, + min_doc_count: this.minDocCount, + } + }; + } +} diff --git a/frontend/src/app/models/elasticsearch.ts b/frontend/src/app/models/elasticsearch.ts index 83d9794de..346b943ed 100644 --- a/frontend/src/app/models/elasticsearch.ts +++ b/frontend/src/app/models/elasticsearch.ts @@ -78,8 +78,3 @@ export interface EsQuery { from?: number; size?: number; } - -export interface Aggregator { - name: string; - size: number; -} diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index b1a844ff0..bed867cd2 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ b/frontend/src/app/services/elastic-search.service.spec.ts @@ -1,10 +1,11 @@ import { TestBed } from '@angular/core/testing'; import { HttpClientTestingModule, HttpTestingController } from '@angular/common/http/testing'; import { ElasticSearchService, SearchResponse } from './elastic-search.service'; -import { Aggregator, QueryModel } from '../models'; +import { QueryModel } from '../models'; import { mockCorpus, 
mockField, mockField2 } from '../../mock-data/corpus'; import { TagService } from './tag.service'; import { TagServiceMock } from '../../mock-data/tag'; +import { Aggregator, TermsAggregator } from '../models/aggregation'; const mockResponse: SearchResponse = { took: 4, @@ -103,10 +104,7 @@ describe('ElasticSearchService', () => { it('should make an aggregation request', async () => { const queryModel = new QueryModel(mockCorpus); - const aggregator: Aggregator = { - name: mockField.name, - size: 10, - }; + const aggregator = new TermsAggregator(mockField, 10); const response = service.aggregateSearch( mockCorpus, queryModel, diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index f3d16dba6..58398592b 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -4,13 +4,14 @@ import { Injectable } from '@angular/core'; import { HttpClient, HttpParams } from '@angular/common/http'; import { FoundDocument, Corpus, QueryModel, SearchResults, - AggregateQueryFeedback, SearchHit, EsQuery, Aggregator + AggregateQueryFeedback, SearchHit, EsQuery } from '../models/index'; import * as _ from 'lodash'; import { TagService } from './tag.service'; import { APIQuery } from '../models/search-requests'; import { PageResultsParameters } from '../models/page-results'; import { resultsParamsToAPIQuery } from '../utils/es-query'; +import { Aggregator } from '../models/aggregation'; @Injectable() @@ -41,7 +42,7 @@ export class ElasticSearchService { ): Promise { const aggregations = {}; aggregators.forEach(d => { - aggregations[d.name] = this.makeAggregation(d.name, d.size, 1); + aggregations[d.name] = d.toEsAggregator(); }); const query = queryModel.toAPIQuery(); const withAggregation = _.set(query, 'es_query.aggs', aggregations); @@ -106,21 +107,6 @@ export class ElasticSearchService { return this.http.post(url, body).toPromise(); } - /** - * 
Construct the aggregator, based on kind of field - * Date fields are aggregated in year intervals - */ - private makeAggregation(aggregator: string, size?: number, min_doc_count?: number) { - const aggregation = { - terms: { - field: aggregator, - size, - min_doc_count - } - }; - return aggregation; - } - /** * Extract relevant information from dictionary returned by ES * diff --git a/frontend/src/app/services/search.service.ts b/frontend/src/app/services/search.service.ts index 99cace4d0..645463b60 100644 --- a/frontend/src/app/services/search.service.ts +++ b/frontend/src/app/services/search.service.ts @@ -7,6 +7,7 @@ import { AggregateQueryFeedback } from '../models/index'; import { PageResultsParameters } from '../models/page-results'; +import { Aggregator } from '../models/aggregation'; @Injectable() @@ -35,7 +36,7 @@ export class SearchService { public async aggregateSearch( corpus: Corpus, queryModel: QueryModel, - aggregators: any + aggregators: Aggregator[], ): Promise { return this.elasticSearchService.aggregateSearch( corpus, diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts index 288e02594..e08f28de7 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.ts @@ -10,6 +10,7 @@ import { RangeFilterOptions} from '../../models/index'; import { selectColor } from '../../utils/select-color'; import { BarchartDirective } from './barchart.directive'; +import { TermsAggregator } from '../../models/aggregation'; function formatXAxisLabel(value): string { const label = this.getLabelForValue(value); // from chartJS api @@ -39,10 +40,11 @@ export class HistogramComponent /** specify aggregator object based on visualised field; * used in document requests. 
*/ - getAggregator() { + getAggregator(): TermsAggregator { let size = 0; + if (!this.visualizedField.filterOptions) { - return { name: this.visualizedField.name, size: 100 }; + return new TermsAggregator(this.visualizedField, 100); } const filterOptions = this.visualizedField.filterOptions; @@ -53,7 +55,7 @@ export class HistogramComponent (filterOptions as RangeFilterOptions).upper - (filterOptions as RangeFilterOptions).lower; } - return { name: this.visualizedField.name, size }; + return new TermsAggregator(this.visualizedField, size); } requestSeriesDocCounts(queryModel: QueryModel) { diff --git a/frontend/src/mock-data/search.ts b/frontend/src/mock-data/search.ts index 86e51fd5a..421327328 100644 --- a/frontend/src/mock-data/search.ts +++ b/frontend/src/mock-data/search.ts @@ -4,10 +4,11 @@ import { AggregateQueryFeedback, Corpus, CorpusField, FoundDocument, QueryModel, import { mockCorpus } from './corpus'; import { TagServiceMock } from './tag'; import { TagService } from '../app/services/tag.service'; +import { Aggregator } from '../app/models/aggregation'; export class SearchServiceMock { - public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregator: [{name: string}]): Promise { - const name = aggregator[0].name; + public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregators: Aggregator[]): Promise { + const name = aggregators[0].name; return { completed: false, aggregations: { From 6795886158648faa51645bb591ffa232499fce84 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 15:57:16 +0200 Subject: [PATCH 02/94] add DateHistogramAggregator class --- frontend/src/app/models/aggregation.ts | 28 ++++++++++++++++- .../app/services/elastic-search.service.ts | 31 ++----------------- frontend/src/app/services/search.service.ts | 14 --------- .../barchart/timeline.component.ts | 12 +++---- 4 files changed, 35 insertions(+), 50 deletions(-) diff --git a/frontend/src/app/models/aggregation.ts 
b/frontend/src/app/models/aggregation.ts index cd10eaead..e3a507411 100644 --- a/frontend/src/app/models/aggregation.ts +++ b/frontend/src/app/models/aggregation.ts @@ -8,7 +8,14 @@ interface EsTermsAggregator { }; } -export type EsAggregator = EsTermsAggregator; +interface EsDateHistogramAggregator { + date_histogram: { + field: string; + calendar_interval?: string; + }; +} + +export type EsAggregator = EsTermsAggregator | EsDateHistogramAggregator; export abstract class Aggregator { name: string; @@ -36,3 +43,22 @@ export class TermsAggregator extends Aggregator { }; } } + +export class DateHistogramAggregator extends Aggregator { + constructor( + private field: CorpusField, + private timeInterval?: string, + ) { + super(); + this.name = field.name; + } + + toEsAggregator(): EsDateHistogramAggregator { + return { + date_histogram: { + field: this.field.name, + calendar_interval: this.timeInterval, + } + }; + } +} diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index 58398592b..685113389 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -1,10 +1,10 @@ /* eslint-disable @typescript-eslint/member-ordering */ /* eslint-disable @typescript-eslint/member-ordering */ import { Injectable } from '@angular/core'; -import { HttpClient, HttpParams } from '@angular/common/http'; +import { HttpClient } from '@angular/common/http'; import { FoundDocument, Corpus, QueryModel, SearchResults, - AggregateQueryFeedback, SearchHit, EsQuery + AggregateQueryFeedback, SearchHit } from '../models/index'; import * as _ from 'lodash'; import { TagService } from './tag.service'; @@ -58,33 +58,6 @@ export class ElasticSearchService { }; } - public async dateHistogramSearch( - corpusDefinition: Corpus, - queryModel: QueryModel, - fieldName: string, - timeInterval: string): Promise { - const agg = { - [fieldName]: { - date_histogram: { - field: 
fieldName, - calendar_interval: timeInterval - } - } - }; - const query = queryModel.toAPIQuery(); - const withAggregation = _.set(query, 'es_query.aggs', agg); - const withSize0 = _.set(withAggregation, 'es_query.size', 0); - const result = await this.execute(corpusDefinition, withSize0); - const aggregateData = {}; - Object.keys(result.aggregations).forEach(field => { - aggregateData[field] = result.aggregations[field].buckets; - }); - return { - completed: true, - aggregations: aggregateData - }; - } - /** * Load results for requested page */ diff --git a/frontend/src/app/services/search.service.ts b/frontend/src/app/services/search.service.ts index 645463b60..5dffb275b 100644 --- a/frontend/src/app/services/search.service.ts +++ b/frontend/src/app/services/search.service.ts @@ -45,20 +45,6 @@ export class SearchService { ); } - public async dateHistogramSearch( - corpus: Corpus, - queryModel: QueryModel, - fieldName: string, - timeInterval: string - ): Promise { - return this.elasticSearchService.dateHistogramSearch( - corpus, - queryModel, - fieldName, - timeInterval - ); - } - /** filter search results for fields included in resultsOverview of the corpus */ private filterResultsFields(results: SearchResults, queryModel: QueryModel): SearchResults { return { diff --git a/frontend/src/app/visualization/barchart/timeline.component.ts b/frontend/src/app/visualization/barchart/timeline.component.ts index d8adc2857..a5029a64d 100644 --- a/frontend/src/app/visualization/barchart/timeline.component.ts +++ b/frontend/src/app/visualization/barchart/timeline.component.ts @@ -2,15 +2,16 @@ import { Component, OnChanges, OnInit } from '@angular/core'; import * as _ from 'lodash'; -import { QueryModel, AggregateResult, TimelineSeries, TimelineDataPoint, TermFrequencyResult, +import { QueryModel, AggregateResult, TimelineSeries, TimelineDataPoint, TimeCategory, DateFilterData, - BarchartSeries} from '../../models/index'; +} from '../../models/index'; import { 
BarchartDirective } from './barchart.directive'; import * as moment from 'moment'; import 'chartjs-adapter-moment'; import { selectColor } from '../../utils/select-color'; import { showLoading } from '../../utils/utils'; +import { DateHistogramAggregator } from '../../models/aggregation'; @Component({ @@ -63,12 +64,11 @@ export class TimelineComponent * True when retrieving results for the entire series, false when retrieving a window. */ requestSeriesDocCounts(queryModel: QueryModel) { - return this.searchService.dateHistogramSearch( - this.corpus, - queryModel, - this.visualizedField.name, + const aggregation = new DateHistogramAggregator( + this.visualizedField, this.currentTimeCategory ); + return this.searchService.aggregateSearch(this.corpus, queryModel, [aggregation]); } requestSeriesTermFrequency(series: TimelineSeries, queryModel: QueryModel) { From 710f3d586553776c3be000eeaa5e83e565e5e0e1 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 16:59:48 +0200 Subject: [PATCH 03/94] handle aggregation results through Aggregator class --- .../multiple-choice-filter.component.ts | 14 ++- frontend/src/app/models/aggregation.ts | 93 ++++++++++++++++--- frontend/src/app/models/frequent-words.ts | 6 +- frontend/src/app/models/search-results.ts | 19 +--- frontend/src/app/models/visualization.ts | 8 +- frontend/src/app/services/api.service.ts | 6 +- .../services/elastic-search.service.spec.ts | 4 +- .../app/services/elastic-search.service.ts | 24 ++--- frontend/src/app/services/search.service.ts | 9 +- .../src/app/services/visualization.service.ts | 4 +- .../barchart/barchart.directive.ts | 16 ++-- .../barchart/histogram.component.ts | 11 +-- .../barchart/timeline.component.ts | 10 +- .../wordcloud/wordcloud.component.ts | 12 +-- frontend/src/mock-data/search.ts | 30 +++--- 15 files changed, 151 insertions(+), 115 deletions(-) diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts
b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts index 3eb1b8396..2db9172ca 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts @@ -33,14 +33,12 @@ export class MultipleChoiceFilterComponent extends BaseFilterComponent response.aggregations[this.filter.corpusField.name]).then(aggregations => - this.options = _.sortBy( - aggregations.map(x => ({ label: x.key, value: x.key, doc_count: x.doc_count })), - o => o.label - ) - ).catch(() => this.options = []); - + this.searchService.aggregateSearch(queryModel.corpus, queryModel, aggregator).then(result => + this.options = _.sortBy( + result.map(x => ({ label: x.key, value: x.key, doc_count: x.doc_count })), + o => o.label + ) + ).catch(() => this.options = []); } } } diff --git a/frontend/src/app/models/aggregation.ts b/frontend/src/app/models/aggregation.ts index e3a507411..4572c6d35 100644 --- a/frontend/src/app/models/aggregation.ts +++ b/frontend/src/app/models/aggregation.ts @@ -15,22 +15,49 @@ interface EsDateHistogramAggregator { }; } -export type EsAggregator = EsTermsAggregator | EsDateHistogramAggregator; +interface EsMinAggregator { + min: { + field: string; + }; +} + +interface EsMaxAggregator { + max: { + field: string; + }; +} + +export type EsAggregator = EsTermsAggregator | EsDateHistogramAggregator | EsMinAggregator | EsMaxAggregator; + +export abstract class Aggregator { + abstract aggName: string; + + constructor(protected field: CorpusField) { + } -export abstract class Aggregator { - name: string; + get name(): string { + return `${this.aggName}_${this.field.name}`; + } abstract toEsAggregator(): EsAggregator; + + abstract parseEsResult(data: any): Result; +} + +export interface TermsResult { + key: string; + doc_count: number; } -export class TermsAggregator extends Aggregator { +export class TermsAggregator extends Aggregator { + 
aggName = 'terms'; + constructor( - private field: CorpusField, + field: CorpusField, private maxSize?: number, private minDocCount?: number ) { - super(); - this.name = field.name; + super(field); } toEsAggregator(): EsTermsAggregator { @@ -42,15 +69,26 @@ export class TermsAggregator extends Aggregator { } }; } + + parseEsResult(data: any): TermsResult[] { + return data.buckets as TermsResult[]; + } +} + +export interface DateHistogramResult { + key: number; + key_as_string: string; + doc_count: number; } -export class DateHistogramAggregator extends Aggregator { +export class DateHistogramAggregator extends Aggregator { + aggName = 'date_histogram'; + constructor( - private field: CorpusField, + field: CorpusField, private timeInterval?: string, ) { - super(); - this.name = field.name; + super(field); } toEsAggregator(): EsDateHistogramAggregator { @@ -61,4 +99,37 @@ export class DateHistogramAggregator extends Aggregator { } }; } + + parseEsResult(data: any): DateHistogramResult[] { + return data.buckets as DateHistogramResult[]; + } +} + + +export class MinAggregator extends Aggregator { + aggName = 'min'; + + toEsAggregator(): EsMinAggregator { + return { + min: { field: this.field.name } + }; + } + + parseEsResult(data: any): number { + return data.value as number; + } +} + +export class MaxAggregator extends Aggregator { + aggName = 'max'; + + toEsAggregator(): EsMaxAggregator { + return { + max: { field: this.field.name } + }; + } + + parseEsResult(data: any): number { + return data.value as number; + } } diff --git a/frontend/src/app/models/frequent-words.ts b/frontend/src/app/models/frequent-words.ts index 77e7194f7..f7afa02e2 100644 --- a/frontend/src/app/models/frequent-words.ts +++ b/frontend/src/app/models/frequent-words.ts @@ -1,6 +1,6 @@ import { Observable, of } from 'rxjs'; import { Results } from './results'; -import { AggregateResult } from './search-results'; +import { MostFrequentWordsResult } from './search-results'; import { Params } from 
'@angular/router'; import { VisualizationService } from '../services'; import { Store } from '../store/types'; @@ -15,7 +15,7 @@ interface FrequentWordsParameters { const BATCH_SIZE = 1000; /** collects a the most frequent words in a text field (based on a query) */ -export class FrequentWordsResults extends Results { +export class FrequentWordsResults extends Results { constructor( store: Store, query: QueryModel, private visualizationService: VisualizationService @@ -25,7 +25,7 @@ export class FrequentWordsResults extends Results { + fetch(): Observable { const field = this.state$.value.field; if (!field) { return of(undefined); } return this.visualizationService.getWordcloudData( diff --git a/frontend/src/app/models/search-results.ts b/frontend/src/app/models/search-results.ts index 352c56f00..ad1593a51 100644 --- a/frontend/src/app/models/search-results.ts +++ b/frontend/src/app/models/search-results.ts @@ -27,23 +27,12 @@ export interface ResultOverview { resultsCount: number; }; -export interface AggregateQueryFeedback { - completed: boolean; - aggregations: AggregateData; -}; - -export interface AggregateFrequencyResults { - success: boolean; - message?: string; - data?: AggregateResult[]; -}; - -export interface AggregateResult { +export interface MostFrequentWordsResult { key: string; doc_count: number; - key_as_string?: string; }; + export interface GeoDocument { id: string; coordinates: { @@ -62,10 +51,6 @@ export interface DateResult { doc_count: number; }; -export interface AggregateData { - [fieldName: string]: AggregateResult[]; -}; - export interface WordSimilarity { key: string; similarity: number; diff --git a/frontend/src/app/models/visualization.ts b/frontend/src/app/models/visualization.ts index f65a85f85..6fe8fac7e 100644 --- a/frontend/src/app/models/visualization.ts +++ b/frontend/src/app/models/visualization.ts @@ -1,4 +1,5 @@ -import { AggregateResult, DateResult } from '.'; +import { DateResult } from '.'; +import { TermsResult } from 
'./aggregation'; import { APIQuery } from './search-requests'; export interface TermFrequencyResult { @@ -31,9 +32,6 @@ export interface TimelineDataPoint { matches_by_doc_count?: number; } -// common type for all histogram/timeline results -export type BarchartResult = DateResult|AggregateResult; - /** * Dataseries for barcharts. * Each dataseries defines its own query text @@ -48,7 +46,7 @@ export type BarchartResult = DateResult|AggregateResult; queryText?: string; // replaces the text in this.queryModel when searching } -export type HistogramSeries = BarchartSeries; +export type HistogramSeries = BarchartSeries; export type TimelineSeries = BarchartSeries; diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index 69ea728ec..f2d33bea7 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -6,7 +6,6 @@ import { interval, Observable } from 'rxjs'; import { filter, switchMap, take, takeUntil } from 'rxjs/operators'; import { ImageInfo } from '../image-view/image-view.component'; import { - AggregateResult, AggregateTermFrequencyParameters, Corpus, CorpusDocumentationPage, @@ -18,6 +17,7 @@ import { FoundDocument, GeoDocument, LimitedResultsDownloadParameters, + MostFrequentWordsResult, NGramRequestParameters, QueryDb, ResultsDownloadParameters, @@ -137,9 +137,9 @@ export class ApiService { } // Visualization - public wordCloud(data: WordcloudParameters): Observable { + public wordCloud(data: WordcloudParameters): Observable { const url = this.apiRoute(this.visApiURL, 'wordcloud'); - return this.http.post(url, data); + return this.http.post(url, data); } public geoData(data: WordcloudParameters): Promise { diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index bed867cd2..53841cc52 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ 
b/frontend/src/app/services/elastic-search.service.spec.ts @@ -48,7 +48,7 @@ const mockAggregationResponse: SearchResponse = { hits: [], }, aggregations: { - great_field: { + terms_great_field: { buckets: [ { key: 'test', doc_count: 15 }, { key: 'testtest', doc_count: 5 }, @@ -108,7 +108,7 @@ describe('ElasticSearchService', () => { const response = service.aggregateSearch( mockCorpus, queryModel, - [aggregator] + aggregator ); const searchUrl = `/api/es/${mockCorpus.name}/_search`; diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index 685113389..8c4df1822 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -4,7 +4,7 @@ import { Injectable } from '@angular/core'; import { HttpClient } from '@angular/common/http'; import { FoundDocument, Corpus, QueryModel, SearchResults, - AggregateQueryFeedback, SearchHit + SearchHit } from '../models/index'; import * as _ from 'lodash'; import { TagService } from './tag.service'; @@ -35,27 +35,19 @@ export class ElasticSearchService { .then(this.firstDocumentFromResults.bind(this)); } - public async aggregateSearch( + public async aggregateSearch( corpusDefinition: Corpus, queryModel: QueryModel, - aggregators: Aggregator[] - ): Promise { - const aggregations = {}; - aggregators.forEach(d => { - aggregations[d.name] = d.toEsAggregator(); - }); + aggregator: Aggregator + ): Promise { + const aggregations = { + [aggregator.name]: aggregator.toEsAggregator() + }; const query = queryModel.toAPIQuery(); const withAggregation = _.set(query, 'es_query.aggs', aggregations); const withSize0 = _.set(withAggregation, 'es_query.size', 0); const result = await this.execute(corpusDefinition, withSize0); - const aggregateData = {}; - Object.keys(result.aggregations).forEach(fieldName => { - aggregateData[fieldName] = result.aggregations[fieldName].buckets; - }); - return { - completed: true, - 
aggregations: aggregateData - }; + return aggregator.parseEsResult(result.aggregations[aggregator.name]); } /** diff --git a/frontend/src/app/services/search.service.ts b/frontend/src/app/services/search.service.ts index 5dffb275b..e45eaffba 100644 --- a/frontend/src/app/services/search.service.ts +++ b/frontend/src/app/services/search.service.ts @@ -4,7 +4,6 @@ import { ApiService } from './api.service'; import { ElasticSearchService } from './elastic-search.service'; import { Corpus, QueryModel, SearchResults, - AggregateQueryFeedback } from '../models/index'; import { PageResultsParameters } from '../models/page-results'; import { Aggregator } from '../models/aggregation'; @@ -33,15 +32,15 @@ export class SearchService { return this.filterResultsFields(results, queryModel); } - public async aggregateSearch( + public async aggregateSearch( corpus: Corpus, queryModel: QueryModel, - aggregators: Aggregator[], - ): Promise { + aggregator: Aggregator, + ): Promise { return this.elasticSearchService.aggregateSearch( corpus, queryModel, - aggregators + aggregator ); } diff --git a/frontend/src/app/services/visualization.service.ts b/frontend/src/app/services/visualization.service.ts index 7d9a38291..2d7119910 100644 --- a/frontend/src/app/services/visualization.service.ts +++ b/frontend/src/app/services/visualization.service.ts @@ -1,10 +1,10 @@ import { Injectable } from '@angular/core'; import { - AggregateResult, AggregateTermFrequencyParameters, Corpus, DateTermFrequencyParameters, GeoDocument, + MostFrequentWordsResult, NGramRequestParameters, NgramParameters, QueryModel, @@ -26,7 +26,7 @@ export class VisualizationService { public getWordcloudData(fieldName: string, queryModel: QueryModel, corpus: Corpus, size: number): - Observable { + Observable { const query = queryModel.toAPIQuery(); return this.apiService.wordCloud({ ...query, diff --git a/frontend/src/app/visualization/barchart/barchart.directive.ts 
b/frontend/src/app/visualization/barchart/barchart.directive.ts index 981252414..16852c217 100644 --- a/frontend/src/app/visualization/barchart/barchart.directive.ts +++ b/frontend/src/app/visualization/barchart/barchart.directive.ts @@ -6,8 +6,8 @@ import * as _ from 'lodash'; import { ApiService, NotificationService, SearchService } from '../../services/index'; import { Chart, ChartOptions } from 'chart.js'; import { - AggregateResult, Corpus, FreqTableHeaders, QueryModel, CorpusField, TaskResult, - BarchartSeries, AggregateQueryFeedback, TimelineDataPoint, HistogramDataPoint, TermFrequencyResult, ChartParameters + Corpus, FreqTableHeaders, QueryModel, CorpusField, TaskResult, + BarchartSeries, TimelineDataPoint, HistogramDataPoint, TermFrequencyResult, ChartParameters } from '../../models'; import Zoom from 'chartjs-plugin-zoom'; import { BehaviorSubject, Subject } from 'rxjs'; @@ -15,6 +15,7 @@ import { selectColor } from '../../utils/select-color'; import { VisualizationService } from '../../services/visualization.service'; import { showLoading } from '../../utils/utils'; import { takeUntil } from 'rxjs/operators'; +import { DateHistogramResult, TermsResult } from '../../models/aggregation'; const hintSeenSessionStorageKey = 'hasSeenTimelineZoomingHint'; const hintHidingMinDelay = 500; // milliseconds @@ -28,6 +29,7 @@ const barchartID = 'barchart'; /** The barchartComponent is used to define shared functionality between the * histogram and timeline components. It does not function as a stand-alone component. */ export abstract class BarchartDirective< + AggregateResult extends TermsResult | DateHistogramResult, DataPoint extends TimelineDataPoint | HistogramDataPoint > implements OnChanges, OnInit, OnDestroy { @HostBinding('style.display') display = 'block'; // needed for loading spinner positioning @@ -339,14 +341,12 @@ export abstract class BarchartDirective< * @returns a copy of the series with the document counts included. 
*/ docCountResultIntoSeries( - result, + result: AggregateResult[], series: BarchartSeries, setSearchRatio = true ): BarchartSeries { - let data = result.aggregations[this.visualizedField.name].map( - this.aggregateResultToDataPoint - ); - const total_doc_count = this.totalDocCount(data); + let data = result.map(this.aggregateResultToDataPoint); + const total_doc_count = this.totalDocCount(result); const searchRatio = setSearchRatio ? this.documentLimit / total_doc_count : series.searchRatio; @@ -533,7 +533,7 @@ export abstract class BarchartDirective< /** Request doc counts for a series */ abstract requestSeriesDocCounts( queryModel: QueryModel - ): Promise; + ): Promise; requestFullData() { this.fullDataRequest() diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts index e08f28de7..a73f630fd 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.ts @@ -2,7 +2,6 @@ import { Component, OnChanges, OnInit } from '@angular/core'; import * as _ from 'lodash'; import { - AggregateResult, HistogramDataPoint, HistogramSeries, MultipleChoiceFilterOptions, @@ -10,7 +9,7 @@ import { RangeFilterOptions} from '../../models/index'; import { selectColor } from '../../utils/select-color'; import { BarchartDirective } from './barchart.directive'; -import { TermsAggregator } from '../../models/aggregation'; +import { TermsAggregator, TermsResult } from '../../models/aggregation'; function formatXAxisLabel(value): string { const label = this.getLabelForValue(value); // from chartJS api @@ -27,7 +26,7 @@ function formatXAxisLabel(value): string { styleUrls: ['./histogram.component.scss'], }) export class HistogramComponent - extends BarchartDirective + extends BarchartDirective implements OnInit, OnChanges { /** On what property should the data be sorted? 
*/ get defaultSort(): string { @@ -61,12 +60,10 @@ export class HistogramComponent requestSeriesDocCounts(queryModel: QueryModel) { const aggregator = this.getAggregator(); - return this.searchService.aggregateSearch(this.corpus, queryModel, [ - aggregator, - ]); + return this.searchService.aggregateSearch(this.corpus, queryModel, aggregator); } - aggregateResultToDataPoint(cat: AggregateResult) { + aggregateResultToDataPoint(cat: TermsResult) { return cat; } diff --git a/frontend/src/app/visualization/barchart/timeline.component.ts b/frontend/src/app/visualization/barchart/timeline.component.ts index a5029a64d..b7be5a80a 100644 --- a/frontend/src/app/visualization/barchart/timeline.component.ts +++ b/frontend/src/app/visualization/barchart/timeline.component.ts @@ -2,7 +2,7 @@ import { Component, OnChanges, OnInit } from '@angular/core'; import * as _ from 'lodash'; -import { QueryModel, AggregateResult, TimelineSeries, TimelineDataPoint, +import { QueryModel, TimelineSeries, TimelineDataPoint, TimeCategory, DateFilterData, } from '../../models/index'; @@ -11,7 +11,7 @@ import * as moment from 'moment'; import 'chartjs-adapter-moment'; import { selectColor } from '../../utils/select-color'; import { showLoading } from '../../utils/utils'; -import { DateHistogramAggregator } from '../../models/aggregation'; +import { DateHistogramAggregator, DateHistogramResult } from '../../models/aggregation'; @Component({ @@ -20,7 +20,7 @@ import { DateHistogramAggregator } from '../../models/aggregation'; styleUrls: ['./timeline.component.scss'], }) export class TimelineComponent - extends BarchartDirective + extends BarchartDirective implements OnChanges, OnInit { /** domain on the axis */ public xDomain: [Date, Date]; @@ -47,7 +47,7 @@ export class TimelineComponent this.currentTimeCategory = this.calculateTimeCategory(min, max); } - aggregateResultToDataPoint(cat: AggregateResult): TimelineDataPoint { + aggregateResultToDataPoint(cat: DateHistogramResult): TimelineDataPoint 
{ /* date fields are returned with keys containing identifiers by elasticsearch replace with string representation, contained in 'key_as_string' field */ @@ -68,7 +68,7 @@ export class TimelineComponent this.visualizedField, this.currentTimeCategory ); - return this.searchService.aggregateSearch(this.corpus, queryModel, [aggregation]); + return this.searchService.aggregateSearch(this.corpus, queryModel, aggregation); } requestSeriesTermFrequency(series: TimelineSeries, queryModel: QueryModel) { diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts index 0329a3dec..53cacee52 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts @@ -3,7 +3,7 @@ import { } from '@angular/core'; -import { AggregateResult, QueryModel, FreqTableHeaders } from '../../models/index'; +import { MostFrequentWordsResult, QueryModel, FreqTableHeaders } from '../../models/index'; import { VisualizationService } from '../../services/visualization.service'; import { Chart, ChartData, ChartDataset, ChartOptions, ScriptableContext, TooltipItem } from 'chart.js'; import { WordCloudChart } from 'chartjs-chart-wordcloud'; @@ -83,7 +83,7 @@ export class WordcloudComponent implements OnChanges, OnDestroy { this.wordcloudError.emit(error?.message); } - makeChart(result: AggregateResult[]) { + makeChart(result: MostFrequentWordsResult[]) { if (!this.asTable) { const data = this.chartData(result); const options = this.chartOptions(result); @@ -97,7 +97,7 @@ export class WordcloudComponent implements OnChanges, OnDestroy { } } - private chartData(result: AggregateResult[]): ChartData<'wordCloud'> { + private chartData(result: MostFrequentWordsResult[]): ChartData<'wordCloud'> { if (result) { const labels = this.chartLabels(result); const datasets = [this.chartDataset(result)]; @@ -106,11 +106,11 @@ export class WordcloudComponent 
implements OnChanges, OnDestroy { return { labels: [], datasets: [] }; } - private chartLabels(result: AggregateResult[]): string[] { + private chartLabels(result: MostFrequentWordsResult[]): string[] { return result.map((item) => item.key); } - private chartDataset(result: AggregateResult[]): ChartDataset<'wordCloud'> { + private chartDataset(result: MostFrequentWordsResult[]): ChartDataset<'wordCloud'> { const frequencies = result.map((item) => item.doc_count); const scale = sizeScale(_.min(frequencies), _.max(frequencies)); const sizes = frequencies.map(scale); @@ -134,7 +134,7 @@ export class WordcloudComponent implements OnChanges, OnDestroy { return (context) => selectColor(palette, context.dataIndex); } - private chartOptions(data: AggregateResult[]): ChartOptions<'wordCloud'> { + private chartOptions(data: MostFrequentWordsResult[]): ChartOptions<'wordCloud'> { return { plugins: { legend: { diff --git a/frontend/src/mock-data/search.ts b/frontend/src/mock-data/search.ts index 421327328..d5c4f233b 100644 --- a/frontend/src/mock-data/search.ts +++ b/frontend/src/mock-data/search.ts @@ -1,29 +1,25 @@ import { PageResultsParameters } from '../app/models/page-results'; import { SearchFilter } from '../app/models/field-filter'; -import { AggregateQueryFeedback, Corpus, CorpusField, FoundDocument, QueryModel, SearchResults } from '../app/models/index'; +import { Corpus, CorpusField, FoundDocument, QueryModel, SearchResults } from '../app/models/index'; import { mockCorpus } from './corpus'; import { TagServiceMock } from './tag'; import { TagService } from '../app/services/tag.service'; import { Aggregator } from '../app/models/aggregation'; export class SearchServiceMock { - public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregators: Aggregator[]): Promise { - const name = aggregators[0].name; - return { - completed: false, - aggregations: { - [name]: [{ - key: '1999', - doc_count: 200 - }, { - key: '2000', - doc_count: 300 - }, { - key: 
'2001', - doc_count: 400 - }] + public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregator: Aggregator): Promise { + return [ + { + key: '1999', + doc_count: 200 + }, { + key: '2000', + doc_count: 300 + }, { + key: '2001', + doc_count: 400 } - }; + ]; } public async getRelatedWords() {} From 6cfe4bdae90db7c6556fcb80b9b30192095b777f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 17:09:39 +0200 Subject: [PATCH 04/94] fetch min date for range filter component --- .../range-filter/range-filter.component.html | 14 ++++--- .../range-filter/range-filter.component.ts | 39 ++++++++++++++++++- frontend/src/app/models/base-filter.ts | 12 +++++- 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/frontend/src/app/filter/range-filter/range-filter.component.html b/frontend/src/app/filter/range-filter/range-filter.component.html index 4aaca7b6d..5e76322e5 100644 --- a/frontend/src/app/filter/range-filter/range-filter.component.html +++ b/frontend/src/app/filter/range-filter/range-filter.component.html @@ -1,6 +1,8 @@ -
- {{data.min}} - {{data.max}} - -
+ +
+ {{data.min}} - {{data.max}} + +
+
diff --git a/frontend/src/app/filter/range-filter/range-filter.component.ts b/frontend/src/app/filter/range-filter/range-filter.component.ts index 81c54b185..1343539f9 100644 --- a/frontend/src/app/filter/range-filter/range-filter.component.ts +++ b/frontend/src/app/filter/range-filter/range-filter.component.ts @@ -4,6 +4,8 @@ import { RangeFilterData, RangeFilter } from '../../models'; import { BaseFilterComponent } from '../base-filter.component'; import { Subject, interval } from 'rxjs'; import { debounce, takeUntil } from 'rxjs/operators'; +import { MaxAggregator, MinAggregator } from '../../models/aggregation'; +import { SearchService } from '../../services'; @Component({ selector: 'ia-range-filter', @@ -18,6 +20,10 @@ export class RangeFilterComponent extends BaseFilterComponent imple private destroy$ = new Subject(); + constructor(private searchService: SearchService) { + super(); + } + ngOnInit(): void { this.sliderValue$.pipe( takeUntil(this.destroy$), @@ -33,8 +39,12 @@ export class RangeFilterComponent extends BaseFilterComponent imple } onFilterSet(filter: RangeFilter): void { - this.min = filter.defaultData.min; - this.max = filter.defaultData.max; + this.fetchDefaultData(filter).then(data => + filter.setDefaultData(data) + ).then(() => { + this.min = filter.defaultData.min; + this.max = filter.defaultData.max; + }); } getFilterData(value: [number, number]): RangeFilterData { @@ -44,4 +54,29 @@ export class RangeFilterComponent extends BaseFilterComponent imple }; } + private fetchDefaultData(filter: RangeFilter): Promise { + return Promise.all( + [this.fetchMin(filter), this.fetchMax(filter)] + ).then(([min, max]) => ({min, max})); + } + + private fetchMin(filter: RangeFilter): Promise { + if (filter.defaultData.min) { + return Promise.resolve(filter.defaultData.min); + } + const aggregator = new MinAggregator(filter.corpusField); + return this.searchService.aggregateSearch( + this.queryModel.corpus, this.queryModel, aggregator + ); + } + + private 
fetchMax(filter: RangeFilter): Promise { + if (filter.defaultData.max) { + return Promise.resolve(filter.defaultData.max); + } + const aggregator = new MaxAggregator(filter.corpusField); + return this.searchService.aggregateSearch( + this.queryModel.corpus, this.queryModel, aggregator + ); + } } diff --git a/frontend/src/app/models/base-filter.ts b/frontend/src/app/models/base-filter.ts index a6eac8ee8..1dd3028ee 100644 --- a/frontend/src/app/models/base-filter.ts +++ b/frontend/src/app/models/base-filter.ts @@ -210,7 +210,17 @@ export abstract class BaseFilter }; } - private isDefault(data: FilterData): boolean { + /** change the default state of the filter */ + setDefaultData(data: FilterData): void { + if (this.isDefault(this.currentData)) { + this.defaultData = data; + this.reset(); + } else { + this.defaultData = data; + } + } + + protected isDefault(data: FilterData): boolean { return _.isEqual(data, this.defaultData); } From 901b1f986a4804c7a2e97b9e8e8fa27ec91d20bf Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 17:29:03 +0200 Subject: [PATCH 05/94] fetch min and max date for date filter --- .../date-filter/date-filter.component.ts | 50 ++++++++++++++++--- frontend/src/app/models/aggregation.ts | 28 +++++++++++ 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/frontend/src/app/filter/date-filter/date-filter.component.ts b/frontend/src/app/filter/date-filter/date-filter.component.ts index 80fb09467..027bfaf00 100644 --- a/frontend/src/app/filter/date-filter/date-filter.component.ts +++ b/frontend/src/app/filter/date-filter/date-filter.component.ts @@ -1,9 +1,11 @@ import { Component } from '@angular/core'; import * as _ from 'lodash'; -import { DateFilter } from '../../models'; +import { DateFilter, DateFilterData } from '../../models'; import { BaseFilterComponent } from '../base-filter.component'; import { BehaviorSubject, combineLatest } from 'rxjs'; +import { MaxDateAggregator, MinDateAggregator } from 
'../../models/aggregation'; +import { SearchService } from '../../services'; @Component({ selector: 'ia-date-filter', @@ -17,15 +19,49 @@ export class DateFilterComponent extends BaseFilterComponent { public selectedMinDate: BehaviorSubject; public selectedMaxDate: BehaviorSubject; + constructor(private searchService: SearchService) { + super(); + } + onFilterSet(filter: DateFilter): void { - this.minDate = filter.defaultData.min; - this.maxDate = filter.defaultData.max; + this.fetchDefaultData(filter).then(data => { + filter.setDefaultData(data); + }).then(() => { + this.minDate = filter.defaultData.min; + this.maxDate = filter.defaultData.max; + + this.selectedMinDate = new BehaviorSubject(filter.currentData.min); + this.selectedMaxDate = new BehaviorSubject(filter.currentData.max); + + combineLatest([this.selectedMinDate, this.selectedMaxDate]).subscribe(([min, max]) => + this.update({ min, max }) + ); + }); + } - this.selectedMinDate = new BehaviorSubject(filter.currentData.min); - this.selectedMaxDate = new BehaviorSubject(filter.currentData.max); + private fetchDefaultData(filter: DateFilter): Promise { + return Promise.all( + [this.fetchMin(filter), this.fetchMax(filter)] + ).then(([min, max]) => ({min, max})); + } + + private fetchMin(filter: DateFilter): Promise { + if (filter.defaultData.min) { + return Promise.resolve(filter.defaultData.min); + } + const aggregator = new MinDateAggregator(filter.corpusField); + return this.searchService.aggregateSearch( + this.queryModel.corpus, this.queryModel, aggregator + ); + } - combineLatest([this.selectedMinDate, this.selectedMaxDate]).subscribe(([min, max]) => - this.update({ min, max }) + private fetchMax(filter: DateFilter): Promise { + if (filter.defaultData.max) { + return Promise.resolve(filter.defaultData.max); + } + const aggregator = new MaxDateAggregator(filter.corpusField); + return this.searchService.aggregateSearch( + this.queryModel.corpus, this.queryModel, aggregator ); } diff --git 
a/frontend/src/app/models/aggregation.ts b/frontend/src/app/models/aggregation.ts index 4572c6d35..ded4e4a24 100644 --- a/frontend/src/app/models/aggregation.ts +++ b/frontend/src/app/models/aggregation.ts @@ -133,3 +133,31 @@ export class MaxAggregator extends Aggregator { return data.value as number; } } + +export class MinDateAggregator extends Aggregator { + aggName = 'min'; + + toEsAggregator(): EsMinAggregator { + return { + min: { field: this.field.name } + }; + } + + parseEsResult(data: any): Date { + return new Date(data.value); + } +} + +export class MaxDateAggregator extends Aggregator { + aggName = 'max'; + + toEsAggregator(): EsMaxAggregator { + return { + max: { field: this.field.name } + }; + } + + parseEsResult(data: any): Date { + return new Date(data.value); + } +} From 567ccb90d8dadee220bbbcaa7455617e77e7ae84 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 18:16:03 +0200 Subject: [PATCH 06/94] add tests for date filter component --- .../date-filter/date-filter.component.spec.ts | 55 ++++++++++++++++--- .../src/app/models/field-filter-options.ts | 4 +- frontend/src/app/models/field-filter.ts | 30 ++++++---- 3 files changed, 68 insertions(+), 21 deletions(-) diff --git a/frontend/src/app/filter/date-filter/date-filter.component.spec.ts b/frontend/src/app/filter/date-filter/date-filter.component.spec.ts index f738f0fc9..ee6f54b75 100644 --- a/frontend/src/app/filter/date-filter/date-filter.component.spec.ts +++ b/frontend/src/app/filter/date-filter/date-filter.component.spec.ts @@ -1,4 +1,4 @@ -import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { ComponentFixture, TestBed, fakeAsync, flushMicrotasks, waitForAsync } from '@angular/core/testing'; import { mockCorpus3, mockFieldDate } from '../../../mock-data/corpus'; import { commonTestBed } from '../../common-test-bed'; @@ -6,14 +6,13 @@ import { DateFilter, DateFilterData, QueryModel } from '../../models'; import { DateFilterComponent } 
from './date-filter.component'; import { SimpleStore } from '../../store/simple-store'; +import { SearchService } from '../../services'; +import * as _ from 'lodash'; describe('DateFilterComponent', () => { let component: DateFilterComponent; + let searchService: SearchService; let fixture: ComponentFixture; - const exampleData0: DateFilterData = { - min: new Date(Date.parse('Jan 01 1810')), - max: new Date(Date.parse('Dec 31 1820')) - }; const exampleData1: DateFilterData = { min: new Date(Date.parse('Jan 01 1850')), max: new Date(Date.parse('Dec 31 1860')) @@ -24,6 +23,7 @@ describe('DateFilterComponent', () => { })); beforeEach(() => { + searchService = TestBed.inject(SearchService); fixture = TestBed.createComponent(DateFilterComponent); component = fixture.componentInstance; component.queryModel = new QueryModel(mockCorpus3); @@ -46,11 +46,52 @@ describe('DateFilterComponent', () => { expect(component.filter.currentData.max).toEqual(exampleData1.max); }); - it('should create a new update when onFilterSet is called', () => { + it('should create a new update when onFilterSet is called', fakeAsync(() => { + spyOn(searchService, 'aggregateSearch').and.returnValue( + Promise.resolve(new Date(Date.now())) + ); + const newFilter = new DateFilter(new SimpleStore(), mockFieldDate); newFilter.set(exampleData1); component.onFilterSet(newFilter); + + flushMicrotasks(); + expect(component.selectedMinDate.value).toEqual(exampleData1.min); expect(component.selectedMaxDate.value).toEqual(exampleData1.max); - }); + })); + + it('should use the specified data range', fakeAsync(() => { + const newFilter = new DateFilter(new SimpleStore(), mockFieldDate); + component.onFilterSet(newFilter); + + flushMicrotasks(); + + expect(component.minDate.getDate()) + .toEqual(new Date(Date.parse('1800-01-01')).getDate()); + expect(component.maxDate.getDate()) + .toEqual(new Date(Date.parse('1899-12-31')).getDate()); + })); + + it('should fetch the data range when not specified', fakeAsync(() 
=> { + const minDate = new Date(Date.parse('1820-01-01')); + const maxDate = new Date(Date.parse('1880-01-01')); + spyOn(searchService, 'aggregateSearch').and.returnValues( + Promise.resolve(minDate), Promise.resolve(maxDate) + ); + + const field = _.cloneDeep(mockFieldDate); + field.filterOptions = { + name: 'DateFilter', + description: '' + }; + + const newFilter = new DateFilter(new SimpleStore(), field); + component.onFilterSet(newFilter); + + flushMicrotasks(); + + expect(component.minDate).toEqual(minDate); + expect(component.maxDate).toEqual(maxDate); + })); }); diff --git a/frontend/src/app/models/field-filter-options.ts b/frontend/src/app/models/field-filter-options.ts index 1dba415a1..b66fdf028 100644 --- a/frontend/src/app/models/field-filter-options.ts +++ b/frontend/src/app/models/field-filter-options.ts @@ -8,8 +8,8 @@ export interface HasDescription { export type DateFilterOptions = { name: 'DateFilter'; - lower: string; - upper: string; + lower?: string; + upper?: string; } & HasDescription; export type MultipleChoiceFilterOptions = { diff --git a/frontend/src/app/models/field-filter.ts b/frontend/src/app/models/field-filter.ts index 2086ba185..de695eadc 100644 --- a/frontend/src/app/models/field-filter.ts +++ b/frontend/src/app/models/field-filter.ts @@ -69,8 +69,8 @@ abstract class AbstractFieldFilter } export interface DateFilterData { - min: Date; - max: Date; + min?: Date; + max?: Date; } export class DateFilter extends AbstractFieldFilter { @@ -88,7 +88,7 @@ export class DateFilter extends AbstractFieldFilter Date: Mon, 15 Apr 2024 18:23:42 +0200 Subject: [PATCH 07/94] make filter range optional in backend --- backend/addcorpus/python_corpora/filters.py | 4 ++-- backend/corpora/dbnl/dbnl.py | 1 - backend/corpora/troonredes/troonredes.py | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/addcorpus/python_corpora/filters.py b/backend/addcorpus/python_corpora/filters.py index 113b7c701..722efd4f4 100644 --- 
a/backend/addcorpus/python_corpora/filters.py +++ b/backend/addcorpus/python_corpora/filters.py @@ -37,7 +37,7 @@ class DateFilter(Filter): mapping_types = (MappingType.DATE, MappingType.DATE_RANGE,) - def __init__(self, lower, upper, *nargs, **kwargs): + def __init__(self, lower=None, upper=None, *nargs, **kwargs): self.lower = lower self.upper = upper super().__init__(*nargs, **kwargs) @@ -50,7 +50,7 @@ class RangeFilter(Filter): mapping_types = (MappingType.INTEGER, MappingType.FLOAT) - def __init__(self, lower, upper, *nargs, **kwargs): + def __init__(self, lower=None, upper=None, *nargs, **kwargs): self.lower = lower self.upper = upper super().__init__(*nargs, **kwargs) diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py index 33933bb44..c58201416 100644 --- a/backend/corpora/dbnl/dbnl.py +++ b/backend/corpora/dbnl/dbnl.py @@ -137,7 +137,6 @@ def _xml_files(self): es_mapping=int_mapping(), search_filter=RangeFilter( description='Select books by publication year', - lower=1200, upper=1890 ), visualizations=['resultscount', 'termfrequency'], sortable=True, diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index 5bbcc17af..0fa5cdc01 100644 --- a/backend/corpora/troonredes/troonredes.py +++ b/backend/corpora/troonredes/troonredes.py @@ -73,10 +73,8 @@ def sources(self, start=min_date, end=max_date): results_overview=True, csv_core=True, search_filter=filters.DateFilter( - min_date, - max_date, description=( - 'Accept only articles with publication date in this range.' + 'Accept only speeches given between these dates.' 
) ), sortable=True From fcb636d46d84eaf6ea865d02a7c7eba7f9641c5e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 15 Apr 2024 18:26:28 +0200 Subject: [PATCH 08/94] allow missing bounds in datefilter/rangefilter --- .../date-filter/date-filter.component.spec.ts | 2 ++ .../src/app/models/field-filter-options.ts | 8 ++--- frontend/src/app/models/field-filter.ts | 29 ++++++++++++++----- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/frontend/src/app/filter/date-filter/date-filter.component.spec.ts b/frontend/src/app/filter/date-filter/date-filter.component.spec.ts index ee6f54b75..eb722b79c 100644 --- a/frontend/src/app/filter/date-filter/date-filter.component.spec.ts +++ b/frontend/src/app/filter/date-filter/date-filter.component.spec.ts @@ -83,6 +83,8 @@ describe('DateFilterComponent', () => { const field = _.cloneDeep(mockFieldDate); field.filterOptions = { name: 'DateFilter', + lower: null, + upper: null, description: '' }; diff --git a/frontend/src/app/models/field-filter-options.ts b/frontend/src/app/models/field-filter-options.ts index b66fdf028..e77c09874 100644 --- a/frontend/src/app/models/field-filter-options.ts +++ b/frontend/src/app/models/field-filter-options.ts @@ -8,8 +8,8 @@ export interface HasDescription { export type DateFilterOptions = { name: 'DateFilter'; - lower?: string; - upper?: string; + lower: string|null; + upper: string|null; } & HasDescription; export type MultipleChoiceFilterOptions = { @@ -19,8 +19,8 @@ export type MultipleChoiceFilterOptions = { export type RangeFilterOptions = { name: 'RangeFilter'; - lower: number; - upper: number; + lower: number|null; + upper: number|null; } & HasDescription; export type BooleanFilterOptions = { diff --git a/frontend/src/app/models/field-filter.ts b/frontend/src/app/models/field-filter.ts index de695eadc..18f37cbc2 100644 --- a/frontend/src/app/models/field-filter.ts +++ b/frontend/src/app/models/field-filter.ts @@ -76,8 +76,8 @@ export interface DateFilterData { export 
class DateFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: DateFilterOptions) { return { - min: this.parseDate(filterOptions.lower), - max: this.parseDate(filterOptions.upper) + min: _.isNull(filterOptions.lower) ? undefined : this.parseDate(filterOptions.lower), + max: _.isNull(filterOptions.upper) ? undefined : this.parseDate(filterOptions.upper) }; } @@ -131,7 +131,7 @@ export class DateFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: RangeFilterOptions): RangeFilterData { return { - min: filterOptions.lower, - max: filterOptions.upper + min: _.isNull(filterOptions.lower) ? undefined : filterOptions.lower, + max: _.isNull(filterOptions.upper) ? undefined : filterOptions.upper }; } @@ -228,13 +228,13 @@ export class RangeFilter extends AbstractFieldFilter { From b7a8a2e55e8c53de77b81b379cd54cc253d8b08b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 11:30:41 +0200 Subject: [PATCH 09/94] deactivate field filter when fetching min/max --- .../date-filter/date-filter.component.ts | 18 +++++++++--------- .../range-filter/range-filter.component.ts | 17 +++++++++-------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/frontend/src/app/filter/date-filter/date-filter.component.ts b/frontend/src/app/filter/date-filter/date-filter.component.ts index 027bfaf00..2476978d2 100644 --- a/frontend/src/app/filter/date-filter/date-filter.component.ts +++ b/frontend/src/app/filter/date-filter/date-filter.component.ts @@ -1,10 +1,10 @@ import { Component } from '@angular/core'; import * as _ from 'lodash'; -import { DateFilter, DateFilterData } from '../../models'; +import { DateFilter, DateFilterData, QueryModel } from '../../models'; import { BaseFilterComponent } from '../base-filter.component'; import { BehaviorSubject, combineLatest } from 'rxjs'; -import { MaxDateAggregator, MinDateAggregator } from '../../models/aggregation'; +import { Aggregator, MaxDateAggregator, MinDateAggregator } from 
'../../models/aggregation'; import { SearchService } from '../../services'; @Component({ @@ -49,20 +49,20 @@ export class DateFilterComponent extends BaseFilterComponent { if (filter.defaultData.min) { return Promise.resolve(filter.defaultData.min); } - const aggregator = new MinDateAggregator(filter.corpusField); - return this.searchService.aggregateSearch( - this.queryModel.corpus, this.queryModel, aggregator - ); + return this.fetchAggregation(new MinDateAggregator(filter.corpusField)); } private fetchMax(filter: DateFilter): Promise { if (filter.defaultData.max) { return Promise.resolve(filter.defaultData.max); } - const aggregator = new MaxDateAggregator(filter.corpusField); + return this.fetchAggregation(new MaxDateAggregator(filter.corpusField)); + } + + private fetchAggregation(aggregator: Aggregator): Promise { + const queryModel = new QueryModel(this.queryModel.corpus); return this.searchService.aggregateSearch( - this.queryModel.corpus, this.queryModel, aggregator + queryModel.corpus, queryModel, aggregator ); } - } diff --git a/frontend/src/app/filter/range-filter/range-filter.component.ts b/frontend/src/app/filter/range-filter/range-filter.component.ts index 1343539f9..caf419415 100644 --- a/frontend/src/app/filter/range-filter/range-filter.component.ts +++ b/frontend/src/app/filter/range-filter/range-filter.component.ts @@ -1,10 +1,10 @@ import { Component, OnDestroy, OnInit } from '@angular/core'; -import { RangeFilterData, RangeFilter } from '../../models'; +import { RangeFilterData, RangeFilter, QueryModel } from '../../models'; import { BaseFilterComponent } from '../base-filter.component'; import { Subject, interval } from 'rxjs'; import { debounce, takeUntil } from 'rxjs/operators'; -import { MaxAggregator, MinAggregator } from '../../models/aggregation'; +import { Aggregator, MaxAggregator, MinAggregator } from '../../models/aggregation'; import { SearchService } from '../../services'; @Component({ @@ -64,19 +64,20 @@ export class 
RangeFilterComponent extends BaseFilterComponent imple if (filter.defaultData.min) { return Promise.resolve(filter.defaultData.min); } - const aggregator = new MinAggregator(filter.corpusField); - return this.searchService.aggregateSearch( - this.queryModel.corpus, this.queryModel, aggregator - ); + return this.fetchAggregation(new MinAggregator(filter.corpusField)); } private fetchMax(filter: RangeFilter): Promise { if (filter.defaultData.max) { return Promise.resolve(filter.defaultData.max); } - const aggregator = new MaxAggregator(filter.corpusField); + return this.fetchAggregation(new MaxAggregator(filter.corpusField)); + } + + private fetchAggregation(aggregator: Aggregator): Promise { + const queryModel = new QueryModel(this.queryModel.corpus); return this.searchService.aggregateSearch( - this.queryModel.corpus, this.queryModel, aggregator + queryModel.corpus, queryModel, aggregator ); } } From 4147d4aed5648e520d887153ce62bfeffa94cb63 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 11:59:29 +0200 Subject: [PATCH 10/94] code clarity --- frontend/src/app/models/field-filter.ts | 48 +++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/frontend/src/app/models/field-filter.ts b/frontend/src/app/models/field-filter.ts index 18f37cbc2..a09141ab8 100644 --- a/frontend/src/app/models/field-filter.ts +++ b/frontend/src/app/models/field-filter.ts @@ -73,11 +73,23 @@ export interface DateFilterData { max?: Date; } +/** + * Filter for date fields + * + * Filter data is a range, e.g. [1st Jan 2000, 31st Dec 2000]. The min and max values can + * also be undefined, which means no bound in that direction. + * + * The default data is always the widest possible range. This can be specified on the + * field; if not specified, the default range will be (∞,∞). However, note that if the + * DateFilter is controlled by a DateFilterComponent, the component will fill in finite + * bounds based on an aggregation. 
+ */ export class DateFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: DateFilterOptions) { + const parse = value => _.isNull(value) ? undefined : this.parseDate(value); return { - min: _.isNull(filterOptions.lower) ? undefined : this.parseDate(filterOptions.lower), - max: _.isNull(filterOptions.upper) ? undefined : this.parseDate(filterOptions.upper) + min: parse(filterOptions.lower), + max: parse(filterOptions.upper) }; } @@ -137,6 +149,9 @@ export class DateFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: BooleanFilterOptions) { @@ -175,6 +190,9 @@ export class BooleanFilter extends AbstractFieldFilter type MultipleChoiceFilterData = string[]; +/** + * Filter for keyword fields + */ export class MultipleChoiceFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: MultipleChoiceFilterOptions): MultipleChoiceFilterData { return []; @@ -209,15 +227,27 @@ export class MultipleChoiceFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: RangeFilterOptions): RangeFilterData { + const parse = (value) => _.isNull(value) ? undefined : value; return { - min: _.isNull(filterOptions.lower) ? undefined : filterOptions.lower, - max: _.isNull(filterOptions.upper) ? 
undefined : filterOptions.upper + min: parse(filterOptions.lower), + max: parse(filterOptions.upper) }; } @@ -269,6 +299,12 @@ export class RangeFilter extends AbstractFieldFilter { makeDefaultData(filterOptions: FieldFilterOptions) { } From 59fc533b124da5eda41a0df635cf219d622c8713 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 12:05:59 +0200 Subject: [PATCH 11/94] add test for range filter --- .../range-filter.component.spec.ts | 42 ++++++++++++++++++- .../range-filter/range-filter.component.ts | 5 ++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/frontend/src/app/filter/range-filter/range-filter.component.spec.ts b/frontend/src/app/filter/range-filter/range-filter.component.spec.ts index 098cfdc30..33daeaec2 100644 --- a/frontend/src/app/filter/range-filter/range-filter.component.spec.ts +++ b/frontend/src/app/filter/range-filter/range-filter.component.spec.ts @@ -1,13 +1,17 @@ -import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { ComponentFixture, TestBed, fakeAsync, flushMicrotasks, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../../common-test-bed'; import { RangeFilterComponent } from './range-filter.component'; import { QueryModel, RangeFilter } from '../../models'; import { mockCorpus3, mockField3 } from '../../../mock-data/corpus'; +import { SearchService } from '../../services'; +import * as _ from 'lodash'; +import { SimpleStore } from '../../store/simple-store'; describe('RangeFilterComponent', () => { let component: RangeFilterComponent; + let searchService: SearchService; let fixture: ComponentFixture; beforeEach(waitForAsync(() => { @@ -15,6 +19,7 @@ describe('RangeFilterComponent', () => { })); beforeEach(() => { + searchService = TestBed.inject(SearchService); fixture = TestBed.createComponent(RangeFilterComponent); component = fixture.componentInstance; component.queryModel = new QueryModel(mockCorpus3); @@ -26,4 +31,39 @@ 
describe('RangeFilterComponent', () => { it('should create', () => { expect(component).toBeTruthy(); }); + + it('should use the specified data range', fakeAsync(() => { + const newFilter = new RangeFilter(new SimpleStore(), mockField3); + component.onFilterSet(newFilter); + + flushMicrotasks(); + + expect(component.min).toEqual(0); + expect(component.max).toEqual(100); + })); + + + it('should fetch the data range when not specified', fakeAsync(() => { + const min = 300; + const max = 400; + spyOn(searchService, 'aggregateSearch').and.returnValues( + Promise.resolve(min), Promise.resolve(max) + ); + + const field = _.cloneDeep(mockField3); + field.filterOptions = { + name: 'RangeFilter', + lower: null, + upper: null, + description: '' + }; + + const newFilter = new RangeFilter(new SimpleStore(), field); + component.onFilterSet(newFilter); + + flushMicrotasks(); + + expect(component.min).toEqual(min); + expect(component.max).toEqual(max); + })); }); diff --git a/frontend/src/app/filter/range-filter/range-filter.component.ts b/frontend/src/app/filter/range-filter/range-filter.component.ts index caf419415..91936e20a 100644 --- a/frontend/src/app/filter/range-filter/range-filter.component.ts +++ b/frontend/src/app/filter/range-filter/range-filter.component.ts @@ -6,6 +6,7 @@ import { Subject, interval } from 'rxjs'; import { debounce, takeUntil } from 'rxjs/operators'; import { Aggregator, MaxAggregator, MinAggregator } from '../../models/aggregation'; import { SearchService } from '../../services'; +import * as _ from 'lodash'; @Component({ selector: 'ia-range-filter', @@ -61,14 +62,14 @@ export class RangeFilterComponent extends BaseFilterComponent imple } private fetchMin(filter: RangeFilter): Promise { - if (filter.defaultData.min) { + if (!_.isUndefined(filter.defaultData.min)) { return Promise.resolve(filter.defaultData.min); } return this.fetchAggregation(new MinAggregator(filter.corpusField)); } private fetchMax(filter: RangeFilter): Promise { - if 
(filter.defaultData.max) { + if (!_.isUndefined(filter.defaultData.max)) { return Promise.resolve(filter.defaultData.max); } return this.fetchAggregation(new MaxAggregator(filter.corpusField)); From 91e7a9d5fdee53efd9a2a140b0bd046a81e35cce Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:14:38 +0200 Subject: [PATCH 12/94] move basic csv mock corpus --- .../python_corpora/tests/test_save_corpus.py | 3 ++- backend/addcorpus/tests/test_citation.py | 2 +- backend/addcorpus/tests/test_csvcorpus.py | 4 +--- backend/addcorpus/tests/test_reader.py | 5 +++-- .../tests => corpora_test/csv}/citation/citation.md | 0 .../csv}/description/mock-csv-corpus.md | 0 .../tests => corpora_test/csv}/images/corpus.jpg | Bin .../tests => corpora_test/csv}/mock_basic_corpus.py | 2 +- .../tests => corpora_test/csv}/mock_csv_corpus.py | 2 +- .../csv/source_data}/example.csv | 0 backend/ianalyzer/settings_test.py | 6 ++++-- backend/media/tests/media_mock_corpus.py | 2 +- 12 files changed, 14 insertions(+), 12 deletions(-) rename backend/{addcorpus/tests => corpora_test/csv}/citation/citation.md (100%) rename backend/{addcorpus/tests => corpora_test/csv}/description/mock-csv-corpus.md (100%) rename backend/{addcorpus/tests => corpora_test/csv}/images/corpus.jpg (100%) rename backend/{addcorpus/tests => corpora_test/csv}/mock_basic_corpus.py (56%) rename backend/{addcorpus/tests => corpora_test/csv}/mock_csv_corpus.py (95%) rename backend/{addcorpus/tests/csv_example => corpora_test/csv/source_data}/example.csv (100%) diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py index 41ebb5cd4..ddf8fb55c 100644 --- a/backend/addcorpus/python_corpora/tests/test_save_corpus.py +++ b/backend/addcorpus/python_corpora/tests/test_save_corpus.py @@ -1,7 +1,7 @@ import sys import pytest from django.conf import settings -from addcorpus.tests.mock_csv_corpus import MockCSVCorpus +from 
corpora_test.csv.mock_csv_corpus import MockCSVCorpus from addcorpus.models import Corpus, CorpusConfiguration from addcorpus.python_corpora.save_corpus import (_save_field_in_database, load_and_save_all_corpora, _save_or_skip_corpus @@ -19,6 +19,7 @@ def test_saved_corpora(db): for corpus_name in configured: assert Corpus.objects.filter(name=corpus_name).exists() corpus = Corpus.objects.get(name=corpus_name) + conf = corpus.configuration_obj assert corpus.configuration_obj assert corpus.active diff --git a/backend/addcorpus/tests/test_citation.py b/backend/addcorpus/tests/test_citation.py index 045a43924..f246c3292 100644 --- a/backend/addcorpus/tests/test_citation.py +++ b/backend/addcorpus/tests/test_citation.py @@ -15,7 +15,7 @@ @pytest.fixture() def citation_template(settings): - path = os.path.join(settings.BASE_DIR, 'addcorpus', 'tests', 'citation', 'citation.md') + path = os.path.join(settings.BASE_DIR, 'corpora_test', 'csv', 'citation', 'citation.md') with open(path) as f: return f.read() diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py index 3a4a3f79b..51c5529b4 100644 --- a/backend/addcorpus/tests/test_csvcorpus.py +++ b/backend/addcorpus/tests/test_csvcorpus.py @@ -1,6 +1,4 @@ -import pytest - -from addcorpus.tests.mock_csv_corpus import MockCSVCorpus +from corpora_test.csv.mock_csv_corpus import MockCSVCorpus import os here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index b126477f2..16e7ede09 100644 --- a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -1,8 +1,8 @@ import os +from django.conf import settings from addcorpus.models import Corpus from addcorpus.reader import make_reader -HERE = os.path.abspath(os.path.dirname(__file__)) def test_make_reader_python(mock_corpus): @@ -18,7 +18,8 @@ def test_make_reader_python(mock_corpus): def 
test_make_reader_json(json_mock_corpus): - json_mock_corpus.configuration.data_directory = os.path.join(HERE, 'csv_example') + data_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'csv', 'source_data') + json_mock_corpus.configuration.data_directory = data_dir json_mock_corpus.configuration.save() reader = make_reader(json_mock_corpus) docs = list(reader.documents()) diff --git a/backend/addcorpus/tests/citation/citation.md b/backend/corpora_test/csv/citation/citation.md similarity index 100% rename from backend/addcorpus/tests/citation/citation.md rename to backend/corpora_test/csv/citation/citation.md diff --git a/backend/addcorpus/tests/description/mock-csv-corpus.md b/backend/corpora_test/csv/description/mock-csv-corpus.md similarity index 100% rename from backend/addcorpus/tests/description/mock-csv-corpus.md rename to backend/corpora_test/csv/description/mock-csv-corpus.md diff --git a/backend/addcorpus/tests/images/corpus.jpg b/backend/corpora_test/csv/images/corpus.jpg similarity index 100% rename from backend/addcorpus/tests/images/corpus.jpg rename to backend/corpora_test/csv/images/corpus.jpg diff --git a/backend/addcorpus/tests/mock_basic_corpus.py b/backend/corpora_test/csv/mock_basic_corpus.py similarity index 56% rename from backend/addcorpus/tests/mock_basic_corpus.py rename to backend/corpora_test/csv/mock_basic_corpus.py index f554f485f..0ef6534a6 100644 --- a/backend/addcorpus/tests/mock_basic_corpus.py +++ b/backend/corpora_test/csv/mock_basic_corpus.py @@ -1,4 +1,4 @@ -from addcorpus.tests.mock_csv_corpus import MockCSVCorpus +from corpora_test.csv.mock_csv_corpus import MockCSVCorpus class MockBasicCorpus(MockCSVCorpus): diff --git a/backend/addcorpus/tests/mock_csv_corpus.py b/backend/corpora_test/csv/mock_csv_corpus.py similarity index 95% rename from backend/addcorpus/tests/mock_csv_corpus.py rename to backend/corpora_test/csv/mock_csv_corpus.py index e9d49c645..68686fa06 100644 --- a/backend/addcorpus/tests/mock_csv_corpus.py +++ 
b/backend/corpora_test/csv/mock_csv_corpus.py @@ -13,7 +13,7 @@ class MockCSVCorpus(CSVCorpusDefinition): es_index = 'nothing' min_date = datetime.datetime(year=1, month=1, day=1) max_date = datetime.datetime(year=2022, month=12, day=31) - data_directory = os.path.join(here, 'csv_example') + data_directory = os.path.join(here, 'source_data') citation_page = 'citation.md' field_entry = 'character' diff --git a/backend/addcorpus/tests/csv_example/example.csv b/backend/corpora_test/csv/source_data/example.csv similarity index 100% rename from backend/addcorpus/tests/csv_example/example.csv rename to backend/corpora_test/csv/source_data/example.csv diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index b4fe346d0..3e7907483 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -4,6 +4,8 @@ def path_in_testdir(app, *path_from_app_tests): return os.path.join(BASE_DIR, app, 'tests', *path_from_app_tests) +def test_corpus_path(*path): + return os.path.join(BASE_DIR, 'corpora_test', *path) CORPORA = { 'small-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'small_mock_corpus.py'), @@ -11,8 +13,8 @@ def path_in_testdir(app, *path_from_app_tests): 'multilingual-mock-corpus': path_in_testdir('download', 'mock_corpora', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': path_in_testdir('media', 'media_mock_corpus.py'), - 'mock-csv-corpus': path_in_testdir('addcorpus', 'mock_csv_corpus.py'), - 'mock-basic-corpus': path_in_testdir('addcorpus', 'mock_basic_corpus.py'), + 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), + 'mock-basic-corpus': test_corpus_path('csv', 'mock_basic_corpus.py'), 'wordmodels-mock-corpus': path_in_testdir('wordmodels', 'mock-corpus', 'mock_corpus.py'), 'tagging-mock-corpus': path_in_testdir('tag', 'tag_mock_corpus.py'), } diff --git a/backend/media/tests/media_mock_corpus.py 
b/backend/media/tests/media_mock_corpus.py index 1a96e2c15..058fd8849 100644 --- a/backend/media/tests/media_mock_corpus.py +++ b/backend/media/tests/media_mock_corpus.py @@ -1,6 +1,6 @@ import os -from addcorpus.tests.mock_csv_corpus import MockCSVCorpus +from corpora_test.csv.mock_csv_corpus import MockCSVCorpus from media.media_url import media_url here = os.path.abspath(os.path.dirname(__file__)) From c14f86640ee5be79358e7a03aa531aa8f87bd6d2 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:20:47 +0200 Subject: [PATCH 13/94] move small mock corpus --- .../mock_corpora => corpora_test/small}/small_mock_corpus.py | 2 +- .../small/source_data}/example.csv | 0 backend/ianalyzer/settings_test.py | 2 +- backend/visualization/conftest.py | 4 ++-- 4 files changed, 4 insertions(+), 4 deletions(-) rename backend/{visualization/tests/mock_corpora => corpora_test/small}/small_mock_corpus.py (97%) rename backend/{visualization/tests/mock_corpora/source_files => corpora_test/small/source_data}/example.csv (100%) diff --git a/backend/visualization/tests/mock_corpora/small_mock_corpus.py b/backend/corpora_test/small/small_mock_corpus.py similarity index 97% rename from backend/visualization/tests/mock_corpora/small_mock_corpus.py rename to backend/corpora_test/small/small_mock_corpus.py index 6d638b9c0..0a9270df4 100644 --- a/backend/visualization/tests/mock_corpora/small_mock_corpus.py +++ b/backend/corpora_test/small/small_mock_corpus.py @@ -18,7 +18,7 @@ class SmallMockCorpus(CSVCorpusDefinition): min_date = datetime(year=1800, month=1, day=1) max_date = datetime(year=1899, month=12, day=31) es_index = 'ianalyzer-mock-corpus' - data_directory = os.path.join(here, 'source_files') + data_directory = os.path.join(here, 'source_data') languages = ['en'] category = 'book' diff --git a/backend/visualization/tests/mock_corpora/source_files/example.csv b/backend/corpora_test/small/source_data/example.csv similarity index 100% rename from 
backend/visualization/tests/mock_corpora/source_files/example.csv rename to backend/corpora_test/small/source_data/example.csv diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 3e7907483..6742ba03d 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -8,7 +8,7 @@ def test_corpus_path(*path): return os.path.join(BASE_DIR, 'corpora_test', *path) CORPORA = { - 'small-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'small_mock_corpus.py'), + 'small-mock-corpus': test_corpus_path('small', 'small_mock_corpus.py'), 'large-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'large_mock_corpus.py'), 'multilingual-mock-corpus': path_in_testdir('download', 'mock_corpora', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), diff --git a/backend/visualization/conftest.py b/backend/visualization/conftest.py index 615e57f1d..5472630cd 100644 --- a/backend/visualization/conftest.py +++ b/backend/visualization/conftest.py @@ -3,7 +3,7 @@ import random from conftest import index_test_corpus, clear_test_corpus -from visualization.tests.mock_corpora.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS +from corpora_test.small.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS from visualization.tests.mock_corpora.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS here = os.path.abspath(os.path.dirname(__file__)) @@ -24,7 +24,7 @@ def search(self, index, size, **kwargs): 'hits': [{'_id': hit_id} for hit_id in range(min(size, self.num_hits))]}, '_scroll_id': '42' } - + def clear_scroll(self, scroll_id): return {'status': 'ok'} From 6f6f579c7dc8e79da39e35c74827ccd7c91b02a6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:23:36 +0200 Subject: [PATCH 14/94] move large mock corpus --- .../mock_corpora => corpora_test/large}/large_mock_corpus.py | 0 backend/ianalyzer/settings_test.py | 2 +- 
backend/visualization/conftest.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename backend/{visualization/tests/mock_corpora => corpora_test/large}/large_mock_corpus.py (100%) diff --git a/backend/visualization/tests/mock_corpora/large_mock_corpus.py b/backend/corpora_test/large/large_mock_corpus.py similarity index 100% rename from backend/visualization/tests/mock_corpora/large_mock_corpus.py rename to backend/corpora_test/large/large_mock_corpus.py diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 6742ba03d..808ea16af 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -9,7 +9,7 @@ def test_corpus_path(*path): CORPORA = { 'small-mock-corpus': test_corpus_path('small', 'small_mock_corpus.py'), - 'large-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'large_mock_corpus.py'), + 'large-mock-corpus': test_corpus_path('large', 'large_mock_corpus.py'), 'multilingual-mock-corpus': path_in_testdir('download', 'mock_corpora', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': path_in_testdir('media', 'media_mock_corpus.py'), diff --git a/backend/visualization/conftest.py b/backend/visualization/conftest.py index 5472630cd..e9ff755cc 100644 --- a/backend/visualization/conftest.py +++ b/backend/visualization/conftest.py @@ -4,7 +4,7 @@ from conftest import index_test_corpus, clear_test_corpus from corpora_test.small.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS -from visualization.tests.mock_corpora.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS +from corpora_test.large.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS here = os.path.abspath(os.path.dirname(__file__)) From 667da67e9bdba88ff87169bc0c4a77034fb38704 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:26:48 +0200 Subject: [PATCH 15/94] move mixed language mock corpus --- 
.../mixed_language}/multilingual_mock_corpus.py | 2 +- .../mixed_language/source_data}/data.csv | 0 backend/download/conftest.py | 2 +- backend/ianalyzer/settings_test.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename backend/{download/tests/mock_corpora => corpora_test/mixed_language}/multilingual_mock_corpus.py (95%) rename backend/{download/tests/mock_corpora/sources_mixed_language => corpora_test/mixed_language/source_data}/data.csv (100%) diff --git a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py similarity index 95% rename from backend/download/tests/mock_corpora/multilingual_mock_corpus.py rename to backend/corpora_test/mixed_language/multilingual_mock_corpus.py index ff619532c..6d2bdc9da 100644 --- a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py +++ b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py @@ -16,7 +16,7 @@ class MultilingualMockCorpus(CSVCorpusDefinition): min_date = datetime(year=2000, month=1, day=1) max_date = datetime(year=2022, month=12, day=31) es_index = 'ianalyzer-mixed-language-mock-corpus' - data_directory = os.path.join(here, 'sources_mixed_language') + data_directory = os.path.join(here, 'source_data') languages = ['sv', 'de'] category = 'book' diff --git a/backend/download/tests/mock_corpora/sources_mixed_language/data.csv b/backend/corpora_test/mixed_language/source_data/data.csv similarity index 100% rename from backend/download/tests/mock_corpora/sources_mixed_language/data.csv rename to backend/corpora_test/mixed_language/source_data/data.csv diff --git a/backend/download/conftest.py b/backend/download/conftest.py index 7fc818a25..913f1dcdc 100644 --- a/backend/download/conftest.py +++ b/backend/download/conftest.py @@ -1,6 +1,6 @@ import pytest import os -from download.tests.mock_corpora.multilingual_mock_corpus import SPECS as ML_MOCK_CORPUS_SPECS +from 
corpora_test.mixed_language.multilingual_mock_corpus import SPECS as ML_MOCK_CORPUS_SPECS from addcorpus.conftest import basic_corpus from es.conftest import basic_corpus_index from visualization.conftest import small_mock_corpus, large_mock_corpus, index_small_mock_corpus, \ diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 808ea16af..05ebb0b53 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -10,7 +10,7 @@ def test_corpus_path(*path): CORPORA = { 'small-mock-corpus': test_corpus_path('small', 'small_mock_corpus.py'), 'large-mock-corpus': test_corpus_path('large', 'large_mock_corpus.py'), - 'multilingual-mock-corpus': path_in_testdir('download', 'mock_corpora', 'multilingual_mock_corpus.py'), + 'multilingual-mock-corpus': test_corpus_path('mixed_language', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': path_in_testdir('media', 'media_mock_corpus.py'), 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), From b64eaf55ef8a69393734411796ba1dd99a362d59 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:29:01 +0200 Subject: [PATCH 16/94] move media mock corpus --- .../media}/media_mock_corpus.py | 2 +- .../media/source_data}/example.csv | 0 .../media/source_data}/images/ghost.png | Bin .../media/source_data}/images/hamlet.png | Bin backend/ianalyzer/settings_test.py | 2 +- 5 files changed, 2 insertions(+), 2 deletions(-) rename backend/{media/tests => corpora_test/media}/media_mock_corpus.py (91%) rename backend/{media/tests/example_data => corpora_test/media/source_data}/example.csv (100%) rename backend/{media/tests/example_data => corpora_test/media/source_data}/images/ghost.png (100%) rename backend/{media/tests/example_data => corpora_test/media/source_data}/images/hamlet.png (100%) diff --git a/backend/media/tests/media_mock_corpus.py 
b/backend/corpora_test/media/media_mock_corpus.py similarity index 91% rename from backend/media/tests/media_mock_corpus.py rename to backend/corpora_test/media/media_mock_corpus.py index 058fd8849..1a8a257b0 100644 --- a/backend/media/tests/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -6,7 +6,7 @@ here = os.path.abspath(os.path.dirname(__file__)) class MediaMockCorpus(MockCSVCorpus): - data_directory = os.path.join(here, 'example_data') + data_directory = os.path.join(here, 'source_data') scan_image_type = 'image/png' citation_page = None diff --git a/backend/media/tests/example_data/example.csv b/backend/corpora_test/media/source_data/example.csv similarity index 100% rename from backend/media/tests/example_data/example.csv rename to backend/corpora_test/media/source_data/example.csv diff --git a/backend/media/tests/example_data/images/ghost.png b/backend/corpora_test/media/source_data/images/ghost.png similarity index 100% rename from backend/media/tests/example_data/images/ghost.png rename to backend/corpora_test/media/source_data/images/ghost.png diff --git a/backend/media/tests/example_data/images/hamlet.png b/backend/corpora_test/media/source_data/images/hamlet.png similarity index 100% rename from backend/media/tests/example_data/images/hamlet.png rename to backend/corpora_test/media/source_data/images/hamlet.png diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 05ebb0b53..10212f2d2 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -12,7 +12,7 @@ def test_corpus_path(*path): 'large-mock-corpus': test_corpus_path('large', 'large_mock_corpus.py'), 'multilingual-mock-corpus': test_corpus_path('mixed_language', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), - 'media-mock-corpus': path_in_testdir('media', 'media_mock_corpus.py'), + 'media-mock-corpus': test_corpus_path('media', 
'media_mock_corpus.py'), 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), 'mock-basic-corpus': test_corpus_path('csv', 'mock_basic_corpus.py'), 'wordmodels-mock-corpus': path_in_testdir('wordmodels', 'mock-corpus', 'mock_corpus.py'), From 3ef166c034bd642007b049f0cfe0040e0370350d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:31:15 +0200 Subject: [PATCH 17/94] move tag mock corpus --- .../tests/data => corpora_test/tag/source_data}/test_data.csv | 0 backend/{tag/tests => corpora_test/tag}/tag_mock_corpus.py | 4 ++-- backend/ianalyzer/settings_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename backend/{tag/tests/data => corpora_test/tag/source_data}/test_data.csv (100%) rename backend/{tag/tests => corpora_test/tag}/tag_mock_corpus.py (92%) diff --git a/backend/tag/tests/data/test_data.csv b/backend/corpora_test/tag/source_data/test_data.csv similarity index 100% rename from backend/tag/tests/data/test_data.csv rename to backend/corpora_test/tag/source_data/test_data.csv diff --git a/backend/tag/tests/tag_mock_corpus.py b/backend/corpora_test/tag/tag_mock_corpus.py similarity index 92% rename from backend/tag/tests/tag_mock_corpus.py rename to backend/corpora_test/tag/tag_mock_corpus.py index 696b9dcc4..836981ebc 100644 --- a/backend/tag/tests/tag_mock_corpus.py +++ b/backend/corpora_test/tag/tag_mock_corpus.py @@ -14,9 +14,9 @@ class TaggingMockCorpus(CSVCorpusDefinition): es_index = 'tagging-mock-corpus' min_date = datetime.datetime(year=1, month=1, day=1) max_date = datetime.datetime(year=2022, month=12, day=31) - data_directory = os.path.join(here, 'data') + data_directory = os.path.join(here, 'source_data') - def sources(self, start, end): + def sources(self, *args, **kwargs): return ( (os.path.join(self.data_directory, file), {}) for file in os.listdir(self.data_directory) diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 10212f2d2..028e149ce 100644 --- 
a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -16,7 +16,7 @@ def test_corpus_path(*path): 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), 'mock-basic-corpus': test_corpus_path('csv', 'mock_basic_corpus.py'), 'wordmodels-mock-corpus': path_in_testdir('wordmodels', 'mock-corpus', 'mock_corpus.py'), - 'tagging-mock-corpus': path_in_testdir('tag', 'tag_mock_corpus.py'), + 'tagging-mock-corpus': test_corpus_path('tag', 'tag_mock_corpus.py'), } TIMES_DATA = path_in_testdir('addcorpus', '../python_corpora/tests') From 39327ef7777ffbb2f7cefc7891da10fba8346fd8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:33:52 +0200 Subject: [PATCH 18/94] move wordmodels mock corpus --- .../wordmodels}/mock-word-models/model_1810_1839.wv | Bin .../wordmodels}/mock-word-models/model_1840_1869.wv | Bin .../wordmodels}/mock-word-models/model_1870_1899.wv | Bin .../wordmodels}/wm/documentation.md | 0 .../wordmodels/wm_mock_corpus.py} | 0 backend/ianalyzer/settings_test.py | 2 +- 6 files changed, 1 insertion(+), 1 deletion(-) rename backend/{wordmodels/tests/mock-corpus => corpora_test/wordmodels}/mock-word-models/model_1810_1839.wv (100%) rename backend/{wordmodels/tests/mock-corpus => corpora_test/wordmodels}/mock-word-models/model_1840_1869.wv (100%) rename backend/{wordmodels/tests/mock-corpus => corpora_test/wordmodels}/mock-word-models/model_1870_1899.wv (100%) rename backend/{wordmodels/tests/mock-corpus => corpora_test/wordmodels}/wm/documentation.md (100%) rename backend/{wordmodels/tests/mock-corpus/mock_corpus.py => corpora_test/wordmodels/wm_mock_corpus.py} (100%) diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.wv b/backend/corpora_test/wordmodels/mock-word-models/model_1810_1839.wv similarity index 100% rename from backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.wv rename to backend/corpora_test/wordmodels/mock-word-models/model_1810_1839.wv 
diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.wv b/backend/corpora_test/wordmodels/mock-word-models/model_1840_1869.wv similarity index 100% rename from backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.wv rename to backend/corpora_test/wordmodels/mock-word-models/model_1840_1869.wv diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.wv b/backend/corpora_test/wordmodels/mock-word-models/model_1870_1899.wv similarity index 100% rename from backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.wv rename to backend/corpora_test/wordmodels/mock-word-models/model_1870_1899.wv diff --git a/backend/wordmodels/tests/mock-corpus/wm/documentation.md b/backend/corpora_test/wordmodels/wm/documentation.md similarity index 100% rename from backend/wordmodels/tests/mock-corpus/wm/documentation.md rename to backend/corpora_test/wordmodels/wm/documentation.md diff --git a/backend/wordmodels/tests/mock-corpus/mock_corpus.py b/backend/corpora_test/wordmodels/wm_mock_corpus.py similarity index 100% rename from backend/wordmodels/tests/mock-corpus/mock_corpus.py rename to backend/corpora_test/wordmodels/wm_mock_corpus.py diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 028e149ce..cd0400361 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -15,7 +15,7 @@ def test_corpus_path(*path): 'media-mock-corpus': test_corpus_path('media', 'media_mock_corpus.py'), 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), 'mock-basic-corpus': test_corpus_path('csv', 'mock_basic_corpus.py'), - 'wordmodels-mock-corpus': path_in_testdir('wordmodels', 'mock-corpus', 'mock_corpus.py'), + 'wordmodels-mock-corpus': test_corpus_path('wordmodels', 'wm_mock_corpus.py'), 'tagging-mock-corpus': test_corpus_path('tag', 'tag_mock_corpus.py'), } From 365f26876a5a54a887d2aef9cab39ceff4443d04 Mon Sep 17 00:00:00 2001 From: 
Luka van der Plas Date: Fri, 12 Apr 2024 12:35:36 +0200 Subject: [PATCH 19/94] clarify paths --- .../python_corpora/tests/test_save_corpus.py | 2 +- backend/addcorpus/tests/test_citation.py | 2 +- backend/addcorpus/tests/test_csvcorpus.py | 2 +- backend/addcorpus/tests/test_reader.py | 2 +- .../{csv => basic}/citation/citation.md | 0 .../{csv => basic}/description/mock-csv-corpus.md | 0 .../corpora_test/{csv => basic}/images/corpus.jpg | Bin .../{csv => basic}/mock_basic_corpus.py | 2 +- .../corpora_test/{csv => basic}/mock_csv_corpus.py | 0 .../{csv => basic}/source_data/example.csv | 0 backend/corpora_test/media/media_mock_corpus.py | 2 +- backend/ianalyzer/settings_test.py | 10 +++------- 12 files changed, 9 insertions(+), 13 deletions(-) rename backend/corpora_test/{csv => basic}/citation/citation.md (100%) rename backend/corpora_test/{csv => basic}/description/mock-csv-corpus.md (100%) rename backend/corpora_test/{csv => basic}/images/corpus.jpg (100%) rename backend/corpora_test/{csv => basic}/mock_basic_corpus.py (55%) rename backend/corpora_test/{csv => basic}/mock_csv_corpus.py (100%) rename backend/corpora_test/{csv => basic}/source_data/example.csv (100%) diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py index ddf8fb55c..ff546d2ce 100644 --- a/backend/addcorpus/python_corpora/tests/test_save_corpus.py +++ b/backend/addcorpus/python_corpora/tests/test_save_corpus.py @@ -1,7 +1,7 @@ import sys import pytest from django.conf import settings -from corpora_test.csv.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import MockCSVCorpus from addcorpus.models import Corpus, CorpusConfiguration from addcorpus.python_corpora.save_corpus import (_save_field_in_database, load_and_save_all_corpora, _save_or_skip_corpus diff --git a/backend/addcorpus/tests/test_citation.py b/backend/addcorpus/tests/test_citation.py index f246c3292..7a0f24fd5 100644 --- 
a/backend/addcorpus/tests/test_citation.py +++ b/backend/addcorpus/tests/test_citation.py @@ -15,7 +15,7 @@ @pytest.fixture() def citation_template(settings): - path = os.path.join(settings.BASE_DIR, 'corpora_test', 'csv', 'citation', 'citation.md') + path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'citation', 'citation.md') with open(path) as f: return f.read() diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py index 51c5529b4..418b97155 100644 --- a/backend/addcorpus/tests/test_csvcorpus.py +++ b/backend/addcorpus/tests/test_csvcorpus.py @@ -1,4 +1,4 @@ -from corpora_test.csv.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import MockCSVCorpus import os here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index 16e7ede09..7d25cc429 100644 --- a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -18,7 +18,7 @@ def test_make_reader_python(mock_corpus): def test_make_reader_json(json_mock_corpus): - data_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'csv', 'source_data') + data_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'source_data') json_mock_corpus.configuration.data_directory = data_dir json_mock_corpus.configuration.save() reader = make_reader(json_mock_corpus) diff --git a/backend/corpora_test/csv/citation/citation.md b/backend/corpora_test/basic/citation/citation.md similarity index 100% rename from backend/corpora_test/csv/citation/citation.md rename to backend/corpora_test/basic/citation/citation.md diff --git a/backend/corpora_test/csv/description/mock-csv-corpus.md b/backend/corpora_test/basic/description/mock-csv-corpus.md similarity index 100% rename from backend/corpora_test/csv/description/mock-csv-corpus.md rename to backend/corpora_test/basic/description/mock-csv-corpus.md diff --git 
a/backend/corpora_test/csv/images/corpus.jpg b/backend/corpora_test/basic/images/corpus.jpg similarity index 100% rename from backend/corpora_test/csv/images/corpus.jpg rename to backend/corpora_test/basic/images/corpus.jpg diff --git a/backend/corpora_test/csv/mock_basic_corpus.py b/backend/corpora_test/basic/mock_basic_corpus.py similarity index 55% rename from backend/corpora_test/csv/mock_basic_corpus.py rename to backend/corpora_test/basic/mock_basic_corpus.py index 0ef6534a6..daacc7fc4 100644 --- a/backend/corpora_test/csv/mock_basic_corpus.py +++ b/backend/corpora_test/basic/mock_basic_corpus.py @@ -1,4 +1,4 @@ -from corpora_test.csv.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import MockCSVCorpus class MockBasicCorpus(MockCSVCorpus): diff --git a/backend/corpora_test/csv/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py similarity index 100% rename from backend/corpora_test/csv/mock_csv_corpus.py rename to backend/corpora_test/basic/mock_csv_corpus.py diff --git a/backend/corpora_test/csv/source_data/example.csv b/backend/corpora_test/basic/source_data/example.csv similarity index 100% rename from backend/corpora_test/csv/source_data/example.csv rename to backend/corpora_test/basic/source_data/example.csv diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py index 1a8a257b0..db80fb9b0 100644 --- a/backend/corpora_test/media/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -1,6 +1,6 @@ import os -from corpora_test.csv.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import MockCSVCorpus from media.media_url import media_url here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index cd0400361..f1199a51f 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -1,9 +1,5 @@ from 
ianalyzer.settings import * - -def path_in_testdir(app, *path_from_app_tests): - return os.path.join(BASE_DIR, app, 'tests', *path_from_app_tests) - def test_corpus_path(*path): return os.path.join(BASE_DIR, 'corpora_test', *path) @@ -13,13 +9,13 @@ def test_corpus_path(*path): 'multilingual-mock-corpus': test_corpus_path('mixed_language', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': test_corpus_path('media', 'media_mock_corpus.py'), - 'mock-csv-corpus': test_corpus_path('csv', 'mock_csv_corpus.py'), - 'mock-basic-corpus': test_corpus_path('csv', 'mock_basic_corpus.py'), + 'mock-csv-corpus': test_corpus_path('basic', 'mock_csv_corpus.py'), + 'mock-basic-corpus': test_corpus_path('basic', 'mock_basic_corpus.py'), 'wordmodels-mock-corpus': test_corpus_path('wordmodels', 'wm_mock_corpus.py'), 'tagging-mock-corpus': test_corpus_path('tag', 'tag_mock_corpus.py'), } -TIMES_DATA = path_in_testdir('addcorpus', '../python_corpora/tests') +TIMES_DATA = os.path.join(BASE_DIR, 'addcorpus', 'python_corpora', 'tests') TIMES_ES_INDEX = 'times-test' SERVERS['default']['index_prefix'] = 'test' From 1d666e90b1d36116ab2f6721ec8b856067ec641b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 12:52:12 +0200 Subject: [PATCH 20/94] add docstrings to mock corpora --- backend/corpora_test/basic/mock_basic_corpus.py | 3 +++ backend/corpora_test/basic/mock_csv_corpus.py | 10 +++++++++- backend/corpora_test/large/large_mock_corpus.py | 6 +++++- backend/corpora_test/media/media_mock_corpus.py | 4 ++++ .../mixed_language/multilingual_mock_corpus.py | 9 ++++++++- backend/corpora_test/small/small_mock_corpus.py | 10 ++++++++++ backend/corpora_test/tag/tag_mock_corpus.py | 6 ++++++ backend/corpora_test/wordmodels/wm_mock_corpus.py | 8 ++++++-- 8 files changed, 51 insertions(+), 5 deletions(-) diff --git a/backend/corpora_test/basic/mock_basic_corpus.py b/backend/corpora_test/basic/mock_basic_corpus.py 
index daacc7fc4..44d8c354c 100644 --- a/backend/corpora_test/basic/mock_basic_corpus.py +++ b/backend/corpora_test/basic/mock_basic_corpus.py @@ -2,5 +2,8 @@ class MockBasicCorpus(MockCSVCorpus): + ''' + Same as the basic CSV corpus but with a different name. + ''' es_index = 'basic-corpus-index' diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py index 68686fa06..ce0995311 100644 --- a/backend/corpora_test/basic/mock_csv_corpus.py +++ b/backend/corpora_test/basic/mock_csv_corpus.py @@ -6,7 +6,15 @@ here = os.path.abspath(os.path.dirname(__file__)) class MockCSVCorpus(CSVCorpusDefinition): - """Example CSV corpus class for testing""" + ''' + Basic CSV corpus. + + Includes: + - a tiny CSV dataset to test source extraction. + - documentation pages + + Also suitable as a base class to test more specific settings. + ''' title = "Example" description = "Example corpus" diff --git a/backend/corpora_test/large/large_mock_corpus.py b/backend/corpora_test/large/large_mock_corpus.py index ea85fb650..ba59f8f3c 100644 --- a/backend/corpora_test/large/large_mock_corpus.py +++ b/backend/corpora_test/large/large_mock_corpus.py @@ -24,7 +24,11 @@ def generate_text(): class LargeMockCorpus(CorpusDefinition): ''' - For testing the download limit: a mock corpus that contains over + Corpus with a large dataset (> 10.000 documents). + + Documents are small and randomly generated. + + Useful for testing downloads and full data visualisations that need to go past 10.000 documents. ''' diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py index db80fb9b0..e965ca660 100644 --- a/backend/corpora_test/media/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -6,6 +6,10 @@ here = os.path.abspath(os.path.dirname(__file__)) class MediaMockCorpus(MockCSVCorpus): + ''' + Test corpus that includes image attachments to documents. 
+ ''' + data_directory = os.path.join(here, 'source_data') scan_image_type = 'image/png' citation_page = None diff --git a/backend/corpora_test/mixed_language/multilingual_mock_corpus.py b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py index 6d2bdc9da..d8e057042 100644 --- a/backend/corpora_test/mixed_language/multilingual_mock_corpus.py +++ b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py @@ -10,8 +10,15 @@ here = os.path.abspath(os.path.dirname(__file__)) class MultilingualMockCorpus(CSVCorpusDefinition): + ''' + Corpus that includes multiple languages. + + The source data of this corpus includes diacritics, so this corpus is useful + for testing encoding. + ''' + title = 'Multilingual Mock Corpus' - description = 'A mixed-language corpus. Especially useful for testing character encoding' + description = 'A mixed-language corpus.' visualize = [] min_date = datetime(year=2000, month=1, day=1) max_date = datetime(year=2022, month=12, day=31) diff --git a/backend/corpora_test/small/small_mock_corpus.py b/backend/corpora_test/small/small_mock_corpus.py index 0a9270df4..8614e7b91 100644 --- a/backend/corpora_test/small/small_mock_corpus.py +++ b/backend/corpora_test/small/small_mock_corpus.py @@ -12,6 +12,13 @@ here = os.path.abspath(os.path.dirname(__file__)) class SmallMockCorpus(CSVCorpusDefinition): + ''' + CSV corpus with a small dataset to test queries and aggregations. + + Has multiple field types but a small number of documents, so you can test + complex queries and visualisations. + ''' + title = 'Mock Corpus' description = 'Corpus for testing' visualize = [] @@ -66,3 +73,6 @@ def sources(self, *args, **kwargs): 'example_query': 'to', 'content_field': 'content', } +''' +Specifications to test search results in this corpus. 
+''' diff --git a/backend/corpora_test/tag/tag_mock_corpus.py b/backend/corpora_test/tag/tag_mock_corpus.py index 836981ebc..7b4801992 100644 --- a/backend/corpora_test/tag/tag_mock_corpus.py +++ b/backend/corpora_test/tag/tag_mock_corpus.py @@ -9,6 +9,12 @@ class TaggingMockCorpus(CSVCorpusDefinition): + ''' + Mock corpus for tagging. + + Includes an `id` field which makes it easier to test expectations for tags. + ''' + title = 'Tagging Mock Corpus' description = 'Mock corpus for tagging' es_index = 'tagging-mock-corpus' diff --git a/backend/corpora_test/wordmodels/wm_mock_corpus.py b/backend/corpora_test/wordmodels/wm_mock_corpus.py index 5998065d3..0ae0f8ec6 100644 --- a/backend/corpora_test/wordmodels/wm_mock_corpus.py +++ b/backend/corpora_test/wordmodels/wm_mock_corpus.py @@ -6,8 +6,12 @@ here = abspath(dirname(__file__)) class WordmodelsMockCorpus(CorpusDefinition): - title = "Mock corpus with word models represented as Keyed Vectors" - description = "Mock corpus for testing word models, saved as gensim Keyed Vectors" + ''' + Corpus with diachronic word models. 
+ ''' + + title = "Word model corpus" + description = "Mock corpus for testing word models" es_index = 'nothing' min_date = datetime.datetime(year=1810, month=1, day=1) max_date = datetime.datetime(year=1899, month=12, day=31) From 6d073eb60ede736a13a5e010270029576f0ce80d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 13:36:18 +0200 Subject: [PATCH 21/94] make json and python mock corpora equivalent --- backend/addcorpus/tests/test_corpus_views.py | 2 +- backend/addcorpus/tests/test_csvcorpus.py | 33 +++++++++++-------- backend/addcorpus/tests/test_reader.py | 4 +-- backend/conftest.py | 2 +- .../corpora_test/{ => basic}/mock_corpus.json | 0 backend/corpora_test/basic/mock_csv_corpus.py | 24 +++++++++----- 6 files changed, 38 insertions(+), 27 deletions(-) rename backend/corpora_test/{ => basic}/mock_corpus.json (100%) diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 1397af66e..150c4d1b7 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -72,7 +72,7 @@ def test_corpus_serialization(admin_client, mock_corpus): def test_corpus_not_publication_ready(admin_client, mock_corpus): corpus = Corpus.objects.get(name=mock_corpus) - content_field = corpus.configuration.fields.get(name='lines') + content_field = corpus.configuration.fields.get(name='line') content_field.display_type = 'text' content_field.save() diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py index 418b97155..834640ace 100644 --- a/backend/addcorpus/tests/test_csvcorpus.py +++ b/backend/addcorpus/tests/test_csvcorpus.py @@ -7,38 +7,43 @@ target_documents = [ { 'character': 'HAMLET', - 'lines': ["Whither wilt thou lead me? Speak, I\'ll go no further."] + 'line': "Whither wilt thou lead me? Speak, I\'ll go no further." }, { 'character': 'GHOST', - 'lines': ["Mark me."] + 'line': "Mark me." 
}, { 'character': 'HAMLET', - 'lines': ["I will."] + 'line': "I will." }, { 'character': 'GHOST', - 'lines': [ - "My hour is almost come,", - "When I to sulph\'rous and tormenting flames", - "Must render up myself." - ] + 'line': "My hour is almost come,", + }, + { + 'character': 'GHOST', + 'line': "When I to sulph\'rous and tormenting flames", + }, + { + 'character': 'GHOST', + 'line': "Must render up myself.", }, { 'character': 'HAMLET', - 'lines': ["Alas, poor ghost!"] + 'line': "Alas, poor ghost!", + }, + { + 'character': 'GHOST', + 'line': "Pity me not, but lend thy serious hearing", }, { 'character': 'GHOST', - 'lines': [ - "Pity me not, but lend thy serious hearing", - "To what I shall unfold." - ] + 'line': "To what I shall unfold.", }, { 'character': 'HAMLET', - 'lines': ["Speak, I am bound to hear."] + 'line': "Speak, I am bound to hear." }, ] diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index 7d25cc429..e2d8ab0e9 100644 --- a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -10,10 +10,10 @@ def test_make_reader_python(mock_corpus): reader = make_reader(corpus) docs = list(reader.documents()) # The number of lines differs because of different corpus configuration - assert len(docs) == 7 + assert len(docs) == 10 assert docs[0] == { 'character': 'HAMLET', - 'lines': ["Whither wilt thou lead me? Speak, I\'ll go no further."] + 'line': "Whither wilt thou lead me? Speak, I\'ll go no further." 
} diff --git a/backend/conftest.py b/backend/conftest.py index 1c173b652..bd9eda2f7 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -110,7 +110,7 @@ def add_mock_python_corpora_to_db(db, media_dir): @pytest.fixture() def json_corpus_data(): - path = os.path.join(settings.BASE_DIR, 'corpora_test', 'mock_corpus.json') + path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'mock_corpus.json') with open(path) as f: return json.load(f) diff --git a/backend/corpora_test/mock_corpus.json b/backend/corpora_test/basic/mock_corpus.json similarity index 100% rename from backend/corpora_test/mock_corpus.json rename to backend/corpora_test/basic/mock_corpus.json diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py index ce0995311..df52a74f7 100644 --- a/backend/corpora_test/basic/mock_csv_corpus.py +++ b/backend/corpora_test/basic/mock_csv_corpus.py @@ -1,5 +1,7 @@ from addcorpus.python_corpora.corpus import CSVCorpusDefinition, FieldDefinition from addcorpus.python_corpora.extract import CSV +from addcorpus.es_mappings import keyword_mapping, main_content_mapping +from addcorpus.python_corpora.filters import MultipleChoiceFilter import os import datetime @@ -24,8 +26,6 @@ class MockCSVCorpus(CSVCorpusDefinition): data_directory = os.path.join(here, 'source_data') citation_page = 'citation.md' - field_entry = 'character' - languages = ['en'] category = 'book' @@ -38,15 +38,21 @@ def sources(self, **kwargs): fields = [ FieldDefinition( - name = 'character', - extractor = CSV('character') + name='character', + display_name='Character', + description='Character speaking the line', + extractor = CSV('character'), + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter(), + results_overview=True, + visualizations=['resultscount', 'termfrequency'], ), FieldDefinition( - name = 'lines', + name = 'line', display_type = 'text_content', - extractor = CSV( - 'line', - multiple = True, - ) + 
extractor = CSV('line'), + es_mapping=main_content_mapping(), + results_overview=True, + visualizations=['wordcloud'], ) ] From 58bfb48e827dc95aa02b429798cdf79481e7a500 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 14:19:30 +0200 Subject: [PATCH 22/94] change indexing fixtures to function scope --- backend/conftest.py | 79 +++++++++++++++---- .../corpora_test/large/large_mock_corpus.py | 2 +- .../multilingual_mock_corpus.py | 2 +- .../corpora_test/small/small_mock_corpus.py | 2 +- backend/corpora_test/tag/tag_mock_corpus.py | 2 +- backend/download/conftest.py | 13 +-- backend/tag/conftest.py | 10 --- backend/tag/tests/test_tag_filter.py | 6 +- backend/tag/tests/test_views.py | 2 +- backend/visualization/conftest.py | 26 +----- .../visualization/tests/test_field_stats.py | 2 +- 11 files changed, 76 insertions(+), 70 deletions(-) diff --git a/backend/conftest.py b/backend/conftest.py index bd9eda2f7..674fd3d83 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -5,6 +5,7 @@ import pytest import requests from allauth.account.models import EmailAddress +from elasticsearch import Elasticsearch from addcorpus.json_corpora.import_json import import_json_corpus from ianalyzer.elasticsearch import elasticsearch @@ -101,6 +102,68 @@ def es_client(): return client +@pytest.fixture() +def small_mock_corpus() -> str: + return 'small-mock-corpus' + + +@pytest.fixture() +def large_mock_corpus() -> str: + return 'large-mock-corpus' + + +@pytest.fixture() +def ml_mock_corpus() -> str: + return 'multilingual-mock-corpus' + + +@pytest.fixture() +def tag_mock_corpus() -> str: + return 'tagging-mock-corpus' + +def _clear_test_indices(es_client: Elasticsearch): + response = es_client.indices.get(index='test-*') + for index in response.keys(): + es_client.indices.delete(index=index) + + +@pytest.fixture(scope='session') +def test_index_cleanup(es_client: Elasticsearch): + _clear_test_indices(es_client) + yield + _clear_test_indices(es_client) + + +def 
_index_test_corpus(es_client: Elasticsearch, corpus_name: str): + corpus = load_corpus_definition(corpus_name) + + if not es_client.indices.exists(index=corpus.es_index): + index.create(es_client, corpus, False, True, False) + index.populate(es_client, corpus_name, corpus) + # ES is "near real time", so give it a second before we start searching the index + sleep(2) + + +@pytest.fixture() +def index_small_mock_corpus(es_client: Elasticsearch, small_mock_corpus: str, test_index_cleanup): + _index_test_corpus(es_client, small_mock_corpus) + + +@pytest.fixture() +def index_large_mock_corpus(es_client: Elasticsearch, large_mock_corpus: str, test_index_cleanup): + _index_test_corpus(es_client, large_mock_corpus) + + +@pytest.fixture() +def index_ml_mock_corpus(es_client: Elasticsearch, ml_mock_corpus: str, test_index_cleanup): + _index_test_corpus(es_client, ml_mock_corpus) + + +@pytest.fixture() +def index_tag_mock_corpus(es_client: Elasticsearch, tag_mock_corpus: str, test_index_cleanup): + _index_test_corpus(es_client, tag_mock_corpus) + + # mock corpora @pytest.fixture(autouse=True) def add_mock_python_corpora_to_db(db, media_dir): @@ -119,19 +182,3 @@ def json_corpus_data(): def json_mock_corpus(db, json_corpus_data): # add json mock corpora to the database at the start of each test return import_json_corpus(json_corpus_data) - - -def index_test_corpus(es_client, corpus_name): - corpus = load_corpus_definition(corpus_name) - index.create(es_client, corpus, False, True, False) - index.populate(es_client, corpus_name, corpus) - - # ES is "near real time", so give it a second before we start searching the index - sleep(2) - -def clear_test_corpus(es_client, corpus_name): - corpus = load_corpus_definition(corpus_name) - index = corpus.es_index - # check existence in case teardown is executed more than once - if es_client.indices.exists(index = index): - es_client.indices.delete(index = index) diff --git a/backend/corpora_test/large/large_mock_corpus.py 
b/backend/corpora_test/large/large_mock_corpus.py index ba59f8f3c..960939b0c 100644 --- a/backend/corpora_test/large/large_mock_corpus.py +++ b/backend/corpora_test/large/large_mock_corpus.py @@ -37,7 +37,7 @@ class LargeMockCorpus(CorpusDefinition): visualize = [] min_date = datetime(year=1800, month=1, day=1) max_date = datetime(year=1899, month=12, day=31) - es_index = 'large-mock-corpus' + es_index = 'test-large-mock-corpus' data_directory = None languages = ['en'] category = 'book' diff --git a/backend/corpora_test/mixed_language/multilingual_mock_corpus.py b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py index d8e057042..7649785bd 100644 --- a/backend/corpora_test/mixed_language/multilingual_mock_corpus.py +++ b/backend/corpora_test/mixed_language/multilingual_mock_corpus.py @@ -22,7 +22,7 @@ class MultilingualMockCorpus(CSVCorpusDefinition): visualize = [] min_date = datetime(year=2000, month=1, day=1) max_date = datetime(year=2022, month=12, day=31) - es_index = 'ianalyzer-mixed-language-mock-corpus' + es_index = 'test-mixed-language-mock-corpus' data_directory = os.path.join(here, 'source_data') languages = ['sv', 'de'] category = 'book' diff --git a/backend/corpora_test/small/small_mock_corpus.py b/backend/corpora_test/small/small_mock_corpus.py index 8614e7b91..15c830e7c 100644 --- a/backend/corpora_test/small/small_mock_corpus.py +++ b/backend/corpora_test/small/small_mock_corpus.py @@ -24,7 +24,7 @@ class SmallMockCorpus(CSVCorpusDefinition): visualize = [] min_date = datetime(year=1800, month=1, day=1) max_date = datetime(year=1899, month=12, day=31) - es_index = 'ianalyzer-mock-corpus' + es_index = 'test-mock-corpus' data_directory = os.path.join(here, 'source_data') languages = ['en'] category = 'book' diff --git a/backend/corpora_test/tag/tag_mock_corpus.py b/backend/corpora_test/tag/tag_mock_corpus.py index 7b4801992..efc796a29 100644 --- a/backend/corpora_test/tag/tag_mock_corpus.py +++ 
b/backend/corpora_test/tag/tag_mock_corpus.py @@ -17,7 +17,7 @@ class TaggingMockCorpus(CSVCorpusDefinition): title = 'Tagging Mock Corpus' description = 'Mock corpus for tagging' - es_index = 'tagging-mock-corpus' + es_index = 'test-tagging-mock-corpus' min_date = datetime.datetime(year=1, month=1, day=1) max_date = datetime.datetime(year=2022, month=12, day=31) data_directory = os.path.join(here, 'source_data') diff --git a/backend/download/conftest.py b/backend/download/conftest.py index 913f1dcdc..1e740531e 100644 --- a/backend/download/conftest.py +++ b/backend/download/conftest.py @@ -3,9 +3,8 @@ from corpora_test.mixed_language.multilingual_mock_corpus import SPECS as ML_MOCK_CORPUS_SPECS from addcorpus.conftest import basic_corpus from es.conftest import basic_corpus_index -from visualization.conftest import small_mock_corpus, large_mock_corpus, index_small_mock_corpus, \ - index_large_mock_corpus, small_mock_corpus_specs, large_mock_corpus_specs -from conftest import index_test_corpus, clear_test_corpus +from visualization.conftest import small_mock_corpus_specs, large_mock_corpus_specs + from visualization.query import MATCH_ALL from download import tasks @@ -42,13 +41,7 @@ def mock_corpus_specs(mock_corpus, small_mock_corpus, large_mock_corpus, ml_mock } return specs[mock_corpus] -@pytest.fixture(scope='session') -def index_ml_mock_corpus(es_client, ml_mock_corpus): - index_test_corpus(es_client, ml_mock_corpus) - yield ml_mock_corpus - clear_test_corpus(es_client, ml_mock_corpus) - -@pytest.fixture(scope='session') +@pytest.fixture() def index_mock_corpus(es_client, mock_corpus, index_small_mock_corpus, index_large_mock_corpus, index_ml_mock_corpus): yield mock_corpus diff --git a/backend/tag/conftest.py b/backend/tag/conftest.py index a7193c33c..d78310739 100644 --- a/backend/tag/conftest.py +++ b/backend/tag/conftest.py @@ -2,8 +2,6 @@ from django.contrib.auth.models import Group from addcorpus.models import Corpus from tag.models import 
DOCS_PER_TAG_LIMIT, Tag, TaggedDocument -from conftest import index_test_corpus, clear_test_corpus - @pytest.fixture(scope='session') def mock_corpus(): @@ -114,11 +112,3 @@ def multiple_tags(db, mock_corpus, auth_user): doc.tags.add(brilliant_tag) return [riveting_tag, brilliant_tag] - - -@pytest.fixture(scope='session') -def index_mock_corpus(mock_corpus, es_client): - index_test_corpus(es_client, mock_corpus) - yield mock_corpus - clear_test_corpus(es_client, mock_corpus) - diff --git a/backend/tag/tests/test_tag_filter.py b/backend/tag/tests/test_tag_filter.py index f84c72a12..e22fc72b3 100644 --- a/backend/tag/tests/test_tag_filter.py +++ b/backend/tag/tests/test_tag_filter.py @@ -7,13 +7,13 @@ def test_tag_document_ids(mock_corpus, auth_user_tag, tagged_documents): _, docs = tagged_documents assert len(tag_document_ids([auth_user_tag], mock_corpus)) == auth_user_tag.count -def test_tag_filter(mock_corpus, index_mock_corpus, auth_user_tag, tagged_documents): +def test_tag_filter(mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): filter = tag_filter([auth_user_tag.id], mock_corpus) query = {'query': filter} results = search.search(mock_corpus, query) assert search.total_hits(results) == auth_user_tag.count -def test_search_with_tag(mock_corpus, index_mock_corpus, auth_user_tag, tagged_documents): +def test_search_with_tag(mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): query = set_query_text(MATCH_ALL, 'text') results = search.search(mock_corpus, query) @@ -24,7 +24,7 @@ def test_search_with_tag(mock_corpus, index_mock_corpus, auth_user_tag, tagged_d results_with_tag = search.search(mock_corpus, query_with_tag) assert search.total_hits(results_with_tag) == 1 -def test_search_multiple_tags(mock_corpus, index_mock_corpus, multiple_tags): +def test_search_multiple_tags(mock_corpus, index_tag_mock_corpus, multiple_tags): ids = [tag.id for tag in multiple_tags] query = include_tag_filter(MATCH_ALL, ids, mock_corpus) results = 
search.search(mock_corpus, query) diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index 2910c9f12..976923473 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -180,7 +180,7 @@ def search_with_tag(client, corpus_name, tag_id): } return client.post(route, data, content_type = 'application/json') -def test_search_view_with_tag(auth_client, mock_corpus, auth_user_tag, tagged_documents, index_mock_corpus): +def test_search_view_with_tag(auth_client, mock_corpus, auth_user_tag, tagged_documents, index_tag_mock_corpus): response = search_with_tag(auth_client, mock_corpus, auth_user_tag.id) assert status.is_success(response.status_code) assert len(hits(response.data)) == auth_user_tag.count diff --git a/backend/visualization/conftest.py b/backend/visualization/conftest.py index e9ff755cc..59af92409 100644 --- a/backend/visualization/conftest.py +++ b/backend/visualization/conftest.py @@ -2,7 +2,6 @@ import os import random -from conftest import index_test_corpus, clear_test_corpus from corpora_test.small.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS from corpora_test.large.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS @@ -69,14 +68,6 @@ def es_client_k_hits(): ''' return MockClient(500) -@pytest.fixture(scope='session') -def small_mock_corpus(): - return 'small-mock-corpus' - -@pytest.fixture(scope='session') -def large_mock_corpus(scope='session'): - return 'large-mock-corpus' - @pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus'], scope='session') def mock_corpus(request): 'parametrised version of the mock corpus fixtures: runs with both' @@ -104,23 +95,8 @@ def mock_corpus_specs(mock_corpus, small_mock_corpus, large_mock_corpus, } return specs[mock_corpus] -@pytest.fixture(scope='session') -def index_small_mock_corpus(small_mock_corpus, es_client): - '''Create and populate an index for the small mock corpus.''' - - index_test_corpus(es_client, small_mock_corpus) - 
yield small_mock_corpus - clear_test_corpus(es_client, small_mock_corpus) -@pytest.fixture(scope='session') -def index_large_mock_corpus(large_mock_corpus, es_client): - '''Create and populate an index for the large mock corpus''' - - index_test_corpus(es_client, large_mock_corpus) - yield large_mock_corpus - clear_test_corpus(es_client, large_mock_corpus) - -@pytest.fixture(scope='module') +@pytest.fixture() def index_mock_corpus(mock_corpus, index_small_mock_corpus, index_large_mock_corpus): '''Create and populate an index for the mock corpus.''' diff --git a/backend/visualization/tests/test_field_stats.py b/backend/visualization/tests/test_field_stats.py index 46c4321f1..e18e456b5 100644 --- a/backend/visualization/tests/test_field_stats.py +++ b/backend/visualization/tests/test_field_stats.py @@ -11,7 +11,7 @@ def test_count(small_mock_corpus, es_client, index_small_mock_corpus, small_mock assert count_total(es_client, small_mock_corpus) == total_docs -def test_report(small_mock_corpus, es_client,index_small_mock_corpus, small_mock_corpus_specs): +def test_report(small_mock_corpus, es_client, index_small_mock_corpus, small_mock_corpus_specs): report = report_coverage(small_mock_corpus) assert report == { From 36951e58514d59fc8fdb59d61f3c0d5cc442b3a7 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 14:30:29 +0200 Subject: [PATCH 23/94] move + rename fixture for basic csv corpus --- backend/addcorpus/conftest.py | 8 ++--- .../python_corpora/tests/test_save_corpus.py | 30 +++++++++---------- backend/addcorpus/tests/test_corpus_access.py | 20 ++++++------- backend/addcorpus/tests/test_corpus_views.py | 30 +++++++++---------- backend/addcorpus/tests/test_reader.py | 4 +-- backend/conftest.py | 4 +++ 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/backend/addcorpus/conftest.py b/backend/addcorpus/conftest.py index 69eb6a333..7d9f5b01a 100644 --- a/backend/addcorpus/conftest.py +++ b/backend/addcorpus/conftest.py @@ -4,10 +4,10 
@@ from addcorpus.models import Corpus @pytest.fixture() -def group_with_access(db, mock_corpus): +def group_with_access(db, basic_mock_corpus): '''Create a group with access to the mock corpus''' group = Group.objects.create(name='nice-users') - corpus = Corpus.objects.get(name=mock_corpus) + corpus = Corpus.objects.get(name=basic_mock_corpus) corpus.groups.add(group) corpus.save() yield group @@ -15,10 +15,6 @@ def group_with_access(db, mock_corpus): here = os.path.abspath(os.path.dirname(__file__)) -@pytest.fixture() -def mock_corpus(): - return 'mock-csv-corpus' - @pytest.fixture() def basic_corpus(): diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py index ff546d2ce..cd1e5f585 100644 --- a/backend/addcorpus/python_corpora/tests/test_save_corpus.py +++ b/backend/addcorpus/python_corpora/tests/test_save_corpus.py @@ -41,39 +41,39 @@ def test_no_errors_when_saving_corpora(db, capsys): for line in captured.out.split('\n'): assert line == '' or line.startswith('Saved corpus:') -def test_saving_broken_corpus(db, mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_saving_broken_corpus(db, basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) corpus_def = MockCSVCorpus() corpus_def.min_date = 'Not a valid date' - _save_or_skip_corpus(mock_corpus, corpus_def) + _save_or_skip_corpus(basic_mock_corpus, corpus_def) corpus.refresh_from_db() # expect the the corpus to be inactive now assert not corpus.active assert corpus.has_python_definition -def test_remove_corpus_from_settings(db, settings, mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_remove_corpus_from_settings(db, settings, basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) assert corpus.active assert corpus.has_python_definition - path = settings.CORPORA.pop(mock_corpus) + path = settings.CORPORA.pop(basic_mock_corpus) load_and_save_all_corpora() 
corpus.refresh_from_db() assert not corpus.active assert not corpus.has_python_definition - settings.CORPORA[mock_corpus] = path + settings.CORPORA[basic_mock_corpus] = path load_and_save_all_corpora() corpus.refresh_from_db() assert corpus.active assert corpus.has_python_definition @pytest.fixture() -def deactivated_corpus(mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def deactivated_corpus(basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) corpus.active = False corpus.save() @@ -82,8 +82,8 @@ def deactivated_corpus(mock_corpus): corpus.active = True corpus.save() -def test_save_field_definition(db, mock_corpus, deactivated_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_save_field_definition(db, basic_mock_corpus, deactivated_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) corpus_conf = corpus.configuration corpus_def = MockCSVCorpus() @@ -94,22 +94,22 @@ def test_save_field_definition(db, mock_corpus, deactivated_corpus): assert field assert field.name == field_def.name -def test_save_corpus_purity(db, mock_corpus): +def test_save_corpus_purity(db, basic_mock_corpus): ''' Test that saved corpus configurations only depend on the definition at that time, not on the currently saved state ''' - corpus = Corpus.objects.get(name=mock_corpus) + corpus = Corpus.objects.get(name=basic_mock_corpus) corpus_def = MockCSVCorpus() corpus_def.es_alias = 'test' - _save_or_skip_corpus(mock_corpus, corpus_def) + _save_or_skip_corpus(basic_mock_corpus, corpus_def) corpus.refresh_from_db() assert corpus.configuration.es_alias == 'test' corpus_def.es_alias = None - _save_or_skip_corpus(mock_corpus, corpus_def) + _save_or_skip_corpus(basic_mock_corpus, corpus_def) corpus.refresh_from_db() assert not corpus.configuration.es_alias diff --git a/backend/addcorpus/tests/test_corpus_access.py b/backend/addcorpus/tests/test_corpus_access.py index f5e6c843d..8cf0c801c 100644 --- 
a/backend/addcorpus/tests/test_corpus_access.py +++ b/backend/addcorpus/tests/test_corpus_access.py @@ -1,17 +1,17 @@ from users.models import CustomUser, CustomAnonymousUser -def test_access_through_group(db, mock_corpus, group_with_access): +def test_access_through_group(db, basic_mock_corpus, group_with_access): user = CustomUser.objects.create(username='nice-user', password='secret') user.groups.add(group_with_access) user.save() - assert user.has_access(mock_corpus) + assert user.has_access(basic_mock_corpus) -def test_superuser_access(mock_corpus, admin_user): - assert admin_user.has_access(mock_corpus) +def test_superuser_access(basic_mock_corpus, admin_user): + assert admin_user.has_access(basic_mock_corpus) -def test_no_corpus_access(db, mock_corpus): +def test_no_corpus_access(db, basic_mock_corpus): user = CustomUser.objects.create(username='bad-user', password='secret') - assert not user.has_access(mock_corpus) + assert not user.has_access(basic_mock_corpus) def test_basic_corpus_access(db, basic_corpus): @@ -20,7 +20,7 @@ def test_basic_corpus_access(db, basic_corpus): anon = CustomAnonymousUser() assert anon.has_access(basic_corpus) -def test_api_access(db, mock_corpus, group_with_access, auth_client, auth_user): +def test_api_access(db, basic_mock_corpus, group_with_access, auth_client, auth_user): # default: no access response = auth_client.get('/api/corpus/') assert len(response.data) == 0 @@ -30,9 +30,9 @@ def test_api_access(db, mock_corpus, group_with_access, auth_client, auth_user): auth_user.save response = auth_client.get('/api/corpus/') assert len(response.data) == 1 - assert response.data[0].get('name') == mock_corpus + assert response.data[0].get('name') == basic_mock_corpus -def test_superuser_api_access(admin_client, mock_corpus): +def test_superuser_api_access(admin_client, basic_mock_corpus): response = admin_client.get('/api/corpus/') assert response.status_code == 200 - assert any(corpus['name'] == mock_corpus for corpus in 
response.data) + assert any(corpus['name'] == basic_mock_corpus for corpus in response.data) diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 150c4d1b7..54ffcb885 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -12,8 +12,8 @@ def test_no_corpora(db, settings, admin_client): assert status.is_success(response.status_code) assert response.data == [] -def test_corpus_documentation_view(admin_client, mock_corpus, settings): - response = admin_client.get(f'/api/corpus/documentation/{mock_corpus}/') +def test_corpus_documentation_view(admin_client, basic_mock_corpus, settings): + response = admin_client.get(f'/api/corpus/documentation/{basic_mock_corpus}/') assert response.status_code == 200 # should contain citation guidelines @@ -24,43 +24,43 @@ def test_corpus_documentation_view(admin_client, mock_corpus, settings): assert '{{ frontend_url }}' not in content assert settings.BASE_URL in content -def test_corpus_image_view(admin_client, mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_corpus_image_view(admin_client, basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) assert not corpus.configuration.image - response = admin_client.get(f'/api/corpus/image/{mock_corpus}') + response = admin_client.get(f'/api/corpus/image/{basic_mock_corpus}') assert response.status_code == 200 corpus.configuration.image = 'corpus.jpg' corpus.configuration.save - response = admin_client.get(f'/api/corpus/image/{mock_corpus}') + response = admin_client.get(f'/api/corpus/image/{basic_mock_corpus}') assert response.status_code == 200 def test_nonexistent_corpus(admin_client): response = admin_client.get(f'/api/corpus/documentation/unknown-corpus/') assert response.status_code == 404 -def test_no_corpus_access(db, client, mock_corpus): +def test_no_corpus_access(db, client, basic_mock_corpus): '''Test a request from a user that 
should not have access to the corpus''' user = CustomUser.objects.create(username='bad-user', password='secret') client.force_login(user) - response = client.get(f'/api/corpus/documentation/{mock_corpus}/') + response = client.get(f'/api/corpus/documentation/{basic_mock_corpus}/') assert response.status_code == 403 -def test_corpus_documentation_unauthenticated(db, client, basic_corpus, mock_corpus): +def test_corpus_documentation_unauthenticated(db, client, basic_corpus, basic_mock_corpus): response = client.get( - f'/api/corpus/documentation/{mock_corpus}/') + f'/api/corpus/documentation/{basic_mock_corpus}/') assert response.status_code == 401 response = client.get( f'/api/corpus/documentation/{basic_corpus}/') assert response.status_code == 200 -def test_corpus_serialization(admin_client, mock_corpus): +def test_corpus_serialization(admin_client, basic_mock_corpus): response = admin_client.get('/api/corpus/') - corpus = next(c for c in response.data if c['name'] == mock_corpus) + corpus = next(c for c in response.data if c['name'] == basic_mock_corpus) assert corpus assert corpus['languages'] == ['English'] assert corpus['category'] == 'Books' @@ -70,11 +70,11 @@ def test_corpus_serialization(admin_client, mock_corpus): for property in secrets: assert property not in corpus -def test_corpus_not_publication_ready(admin_client, mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_corpus_not_publication_ready(admin_client, basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) content_field = corpus.configuration.fields.get(name='line') content_field.display_type = 'text' content_field.save() response = admin_client.get('/api/corpus/') - corpus = not any(c['name'] == mock_corpus for c in response.data) + corpus = not any(c['name'] == basic_mock_corpus for c in response.data) diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index e2d8ab0e9..05775ec19 100644 --- 
a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -5,8 +5,8 @@ -def test_make_reader_python(mock_corpus): - corpus = Corpus.objects.get(name=mock_corpus) +def test_make_reader_python(basic_mock_corpus): + corpus = Corpus.objects.get(name=basic_mock_corpus) reader = make_reader(corpus) docs = list(reader.documents()) # The number of lines differs because of different corpus configuration diff --git a/backend/conftest.py b/backend/conftest.py index 674fd3d83..6762f0535 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -102,6 +102,10 @@ def es_client(): return client +@pytest.fixture() +def basic_mock_corpus() -> str: + return 'mock-csv-corpus' + @pytest.fixture() def small_mock_corpus() -> str: return 'small-mock-corpus' From f5c42c9a9a9af51ae1152b0fb30fc82e9cd95f08 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 14:49:27 +0200 Subject: [PATCH 24/94] remove duplicate mock corpus --- backend/addcorpus/conftest.py | 11 ----------- backend/addcorpus/tests/test_corpus_access.py | 6 +++--- backend/addcorpus/tests/test_corpus_views.py | 6 ++++-- backend/conftest.py | 16 ++++++++++++++++ backend/corpora_test/basic/mock_basic_corpus.py | 9 --------- backend/corpora_test/basic/mock_csv_corpus.py | 4 ++-- backend/download/conftest.py | 2 -- backend/download/tests/test_download_views.py | 6 +++--- backend/es/conftest.py | 5 ++--- backend/es/tests/test_es_forward.py | 5 ++--- backend/ianalyzer/settings_test.py | 1 - 11 files changed, 32 insertions(+), 39 deletions(-) delete mode 100644 backend/corpora_test/basic/mock_basic_corpus.py diff --git a/backend/addcorpus/conftest.py b/backend/addcorpus/conftest.py index 7d9f5b01a..6aff4e03f 100644 --- a/backend/addcorpus/conftest.py +++ b/backend/addcorpus/conftest.py @@ -14,14 +14,3 @@ def group_with_access(db, basic_mock_corpus): group.delete() here = os.path.abspath(os.path.dirname(__file__)) - - -@pytest.fixture() -def basic_corpus(): - corpus_name = 
'mock-basic-corpus' - basic_group = Group.objects.create(name='basic') - corpus = Corpus.objects.get(name=corpus_name) - corpus.groups.add(basic_group) - yield corpus_name - corpus.groups.remove(basic_group) - basic_group.delete() diff --git a/backend/addcorpus/tests/test_corpus_access.py b/backend/addcorpus/tests/test_corpus_access.py index 8cf0c801c..9e14a4e0d 100644 --- a/backend/addcorpus/tests/test_corpus_access.py +++ b/backend/addcorpus/tests/test_corpus_access.py @@ -14,11 +14,11 @@ def test_no_corpus_access(db, basic_mock_corpus): assert not user.has_access(basic_mock_corpus) -def test_basic_corpus_access(db, basic_corpus): +def test_public_corpus_access(db, basic_corpus_public): user = CustomUser.objects.create(username='new-user', password='secret') - assert user.has_access(basic_corpus) + assert user.has_access(basic_corpus_public) anon = CustomAnonymousUser() - assert anon.has_access(basic_corpus) + assert anon.has_access(basic_corpus_public) def test_api_access(db, basic_mock_corpus, group_with_access, auth_client, auth_user): # default: no access diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 54ffcb885..525ec021c 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -50,12 +50,14 @@ def test_no_corpus_access(db, client, basic_mock_corpus): assert response.status_code == 403 -def test_corpus_documentation_unauthenticated(db, client, basic_corpus, basic_mock_corpus): +def test_corpus_documentation_unauthenticated(db, client, basic_mock_corpus): response = client.get( f'/api/corpus/documentation/{basic_mock_corpus}/') assert response.status_code == 401 + +def test_public_corpus_documentation_unauthenticated(db, client, basic_corpus_public): response = client.get( - f'/api/corpus/documentation/{basic_corpus}/') + f'/api/corpus/documentation/{basic_corpus_public}/') assert response.status_code == 200 def 
test_corpus_serialization(admin_client, basic_mock_corpus): diff --git a/backend/conftest.py b/backend/conftest.py index 6762f0535..bb4e61013 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -14,6 +14,8 @@ from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora from es import es_index as index from django.conf import settings +from django.contrib.auth.models import Group +from addcorpus.models import Corpus @pytest.fixture(autouse=True) @@ -75,6 +77,16 @@ def admin_client(client, admin_user, admin_credentials): yield client client.logout() +@pytest.fixture() +def basic_corpus_public(db, basic_mock_corpus): + basic_group = Group.objects.create(name='basic') + corpus = Corpus.objects.get(name=basic_mock_corpus) + corpus.groups.add(basic_group) + yield basic_mock_corpus + corpus.groups.remove(basic_group) + basic_group.delete() + + @pytest.fixture(scope='session') def connected_to_internet(): """ @@ -147,6 +159,10 @@ def _index_test_corpus(es_client: Elasticsearch, corpus_name: str): # ES is "near real time", so give it a second before we start searching the index sleep(2) +@pytest.fixture() +def index_basic_mock_corpus(es_client: Elasticsearch, basic_mock_corpus: str, test_index_cleanup): + _index_test_corpus(es_client, basic_mock_corpus) + @pytest.fixture() def index_small_mock_corpus(es_client: Elasticsearch, small_mock_corpus: str, test_index_cleanup): diff --git a/backend/corpora_test/basic/mock_basic_corpus.py b/backend/corpora_test/basic/mock_basic_corpus.py deleted file mode 100644 index 44d8c354c..000000000 --- a/backend/corpora_test/basic/mock_basic_corpus.py +++ /dev/null @@ -1,9 +0,0 @@ -from corpora_test.basic.mock_csv_corpus import MockCSVCorpus - - -class MockBasicCorpus(MockCSVCorpus): - ''' - Same as the basic CSV corpus but with a different name. 
- ''' - - es_index = 'basic-corpus-index' diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py index df52a74f7..84711f8a0 100644 --- a/backend/corpora_test/basic/mock_csv_corpus.py +++ b/backend/corpora_test/basic/mock_csv_corpus.py @@ -20,7 +20,7 @@ class MockCSVCorpus(CSVCorpusDefinition): title = "Example" description = "Example corpus" - es_index = 'nothing' + es_index = 'test-basic-corpus' min_date = datetime.datetime(year=1, month=1, day=1) max_date = datetime.datetime(year=2022, month=12, day=31) data_directory = os.path.join(here, 'source_data') @@ -29,7 +29,7 @@ class MockCSVCorpus(CSVCorpusDefinition): languages = ['en'] category = 'book' - def sources(self, **kwargs): + def sources(self, *args, **kwargs): for filename in os.listdir(self.data_directory): full_path = os.path.join(self.data_directory, filename) yield full_path, { diff --git a/backend/download/conftest.py b/backend/download/conftest.py index 1e740531e..45598d882 100644 --- a/backend/download/conftest.py +++ b/backend/download/conftest.py @@ -1,8 +1,6 @@ import pytest import os from corpora_test.mixed_language.multilingual_mock_corpus import SPECS as ML_MOCK_CORPUS_SPECS -from addcorpus.conftest import basic_corpus -from es.conftest import basic_corpus_index from visualization.conftest import small_mock_corpus_specs, large_mock_corpus_specs from visualization.query import MATCH_ALL diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py index 9ae1799fd..ef1c6411a 100644 --- a/backend/download/tests/test_download_views.py +++ b/backend/download/tests/test_download_views.py @@ -230,12 +230,12 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock assert len(rows) == 1 -def test_unauthenticated_download(db, client, basic_corpus, basic_corpus_index): +def test_unauthenticated_download(db, client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus): 
download_request_json = { - 'corpus': basic_corpus, + 'corpus': basic_mock_corpus, 'es_query': mock_match_all_query(), 'fields': ['date', 'content'], - 'route': f"/search/{basic_corpus}", + 'route': f"/search/{basic_mock_corpus}", 'encoding': 'utf-8' } response = client.post('/api/download/search_results', diff --git a/backend/es/conftest.py b/backend/es/conftest.py index c2a3056e6..0dd97acc5 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -2,7 +2,6 @@ from time import sleep from django.contrib.auth.models import Group -from addcorpus.conftest import basic_corpus from addcorpus.python_corpora.load_corpus import load_corpus_definition from addcorpus.models import Corpus from es import es_index @@ -41,8 +40,8 @@ def es_forward_client(es_client, mock_corpus): @pytest.fixture() -def basic_corpus_index(es_client, basic_corpus): - corpus = load_corpus_definition(basic_corpus) +def empty_corpus_index(es_client, basic_mock_corpus): + corpus = load_corpus_definition(basic_mock_corpus) es_index.create(es_client, corpus, False, True, False) yield es_client es_client.indices.delete(index=corpus.es_index) diff --git a/backend/es/tests/test_es_forward.py b/backend/es/tests/test_es_forward.py index 30fb3add6..7dd954c83 100644 --- a/backend/es/tests/test_es_forward.py +++ b/backend/es/tests/test_es_forward.py @@ -1,6 +1,5 @@ import pytest -from addcorpus.conftest import basic_corpus from api.models import Query from es.search import hits from visualization.query import MATCH_ALL @@ -110,10 +109,10 @@ def test_search_history_is_saved(mock_corpus, times_user, es_forward_client, cli assert times_user.queries.count() == 1 -def test_unauthenticated_search(client, basic_corpus, basic_corpus_index): +def test_unauthenticated_search(client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus): queries_before_search = Query.objects.count() response = client.post( - f'/api/es/{basic_corpus}/_search', + f'/api/es/{basic_mock_corpus}/_search', {'es_query': 
MATCH_ALL}, content_type='application/json', ) diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index f1199a51f..a3f013c2b 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -10,7 +10,6 @@ def test_corpus_path(*path): 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': test_corpus_path('media', 'media_mock_corpus.py'), 'mock-csv-corpus': test_corpus_path('basic', 'mock_csv_corpus.py'), - 'mock-basic-corpus': test_corpus_path('basic', 'mock_basic_corpus.py'), 'wordmodels-mock-corpus': test_corpus_path('wordmodels', 'wm_mock_corpus.py'), 'tagging-mock-corpus': test_corpus_path('tag', 'tag_mock_corpus.py'), } From 0c7c919de492531b8d5922e012843aee2db3a63e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 14:59:55 +0200 Subject: [PATCH 25/94] remove session-level fixture in es conftest --- backend/es/conftest.py | 38 ++++------------------------- backend/es/tests/test_es_forward.py | 32 ++++++++++++------------ 2 files changed, 21 insertions(+), 49 deletions(-) diff --git a/backend/es/conftest.py b/backend/es/conftest.py index 0dd97acc5..669193c4b 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -18,34 +18,6 @@ def corpus_definition(mock_corpus): yield corpus -@pytest.fixture(scope='module') -def es_forward_client(es_client, mock_corpus): - """ - Create and populate an index for the mock corpus in elasticsearch. - Returns an elastic search client for the mock corpus. 
- """ - - # add data from mock corpus - corpus = load_corpus_definition(mock_corpus) - es_index.create(es_client, corpus, False, True, False) - es_index.populate(es_client, mock_corpus, corpus) - - es_client.index(index=corpus.es_index, document={'content': 'banana'}) - - # ES is "near real time", so give it a second before we start searching the index - sleep(1) - yield es_client - # delete index when done - es_client.indices.delete(index='times-test') - - -@pytest.fixture() -def empty_corpus_index(es_client, basic_mock_corpus): - corpus = load_corpus_definition(basic_mock_corpus) - es_index.create(es_client, corpus, False, True, False) - yield es_client - es_client.indices.delete(index=corpus.es_index) - @pytest.fixture() def es_index_client(es_client, mock_corpus): """ @@ -77,13 +49,13 @@ def es_alias_client(es_client, mock_corpus): for index in indices.keys(): es_client.indices.delete(index=index) + @pytest.fixture() -def times_user(auth_user, mock_corpus): - group = Group.objects.create(name='times-access') - corpus = Corpus.objects.get(name=mock_corpus) +def small_mock_corpus_user(auth_user, small_mock_corpus): + group = Group.objects.create(name='corpus access') + corpus = Corpus.objects.get(name=small_mock_corpus) corpus.groups.add(group) corpus.save() auth_user.groups.add(group) auth_user.save() - yield auth_user - group.delete() + return auth_user diff --git a/backend/es/tests/test_es_forward.py b/backend/es/tests/test_es_forward.py index 7dd954c83..f44691de0 100644 --- a/backend/es/tests/test_es_forward.py +++ b/backend/es/tests/test_es_forward.py @@ -7,10 +7,10 @@ FORWARD_CASES = { 'search_restricted_corpus': ( False, - '/api/es/times/_search?size=20&scroll=3m', + '/api/es/small-mock-corpus/_search?size=20&scroll=3m', { 'es_query': {'query': {'bool': { 'must': {'simple_query_string': { - 'query': 'banana', + 'query': 'universally', 'lenient': True, 'default_operator': 'or', }}, @@ -21,7 +21,7 @@ ), 'search_bogus': ( True, - 
'/api/es/times/_search?size=20&scroll=3m', + '/api/es/small-mock-corpus/_search?size=20&scroll=3m', { 'es_query': {'query': {'bool': { 'must': {'simple_query_string': { 'query': 'pineapple', @@ -35,10 +35,10 @@ ), 'search_nonexistent': ( True, - '/api/es/daily-mail/_search?size=20&scroll=3m', + '/api/es/nonexistent-corpus/_search?size=20&scroll=3m', { 'es_query': {'query': {'bool': { 'must': {'simple_query_string': { - 'query': 'banana', + 'query': 'universally', 'lenient': True, 'default_operator': 'or', }}, @@ -49,17 +49,17 @@ ), 'search_empty': ( True, - '/api/es/times/_search?size=20&scroll=3m', + '/api/es/small-mock-corpus/_search?size=20&scroll=3m', {}, 3, 200, ), 'search_success': ( True, - '/api/es/times/_search?size=20&scroll=3m', + '/api/es/small-mock-corpus/_search?size=20&scroll=3m', {'es_query': {'query': {'bool': { 'must': {'simple_query_string': { - 'query': 'banana', + 'query': 'universally', 'lenient': True, 'default_operator': 'or', }}, @@ -74,12 +74,12 @@ def scenario(request): return request.param -def test_es_forwarding_views(scenario, es_forward_client, client, times_user): +def test_es_forwarding_views(scenario, index_small_mock_corpus, client, small_mock_corpus_user): (authenticate, route, data, n_hits, status) = scenario if authenticate: - client.force_login(times_user) + client.force_login(small_mock_corpus_user) response = client.post(route, data, content_type = 'application/json') assert response.status_code == status @@ -87,13 +87,13 @@ def test_es_forwarding_views(scenario, es_forward_client, client, times_user): if response.status_code == 200: assert len(hits(response.data)) == n_hits -def test_search_history_is_saved(mock_corpus, times_user, es_forward_client, client): - assert times_user.queries.count() == 0 +def test_search_history_is_saved(small_mock_corpus, small_mock_corpus_user, index_small_mock_corpus, client): + assert small_mock_corpus_user.queries.count() == 0 - client.force_login(times_user) + 
client.force_login(small_mock_corpus_user) search = lambda: client.post( - '/api/es/times/_search', + f'/api/es/{small_mock_corpus}/_search', {'es_query': MATCH_ALL}, content_type='application/json', ) @@ -101,12 +101,12 @@ def test_search_history_is_saved(mock_corpus, times_user, es_forward_client, cli response = search() assert response.status_code == 200 - assert times_user.queries.count() == 1 + assert small_mock_corpus_user.queries.count() == 1 response2 = search() assert response2.status_code == 200 - assert times_user.queries.count() == 1 + assert small_mock_corpus_user.queries.count() == 1 def test_unauthenticated_search(client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus): From 852f8f5c7803d96dee24f1307b114e69a6baa70a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 15:04:01 +0200 Subject: [PATCH 26/94] remove unused import --- backend/es/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/es/conftest.py b/backend/es/conftest.py index 669193c4b..ac99c36a7 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -1,5 +1,4 @@ import pytest -from time import sleep from django.contrib.auth.models import Group from addcorpus.python_corpora.load_corpus import load_corpus_definition From 50225fb5f6e9040ce78ee9e55a0bdfc4e74609ed Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 12 Apr 2024 15:11:41 +0200 Subject: [PATCH 27/94] code cleanup --- backend/addcorpus/conftest.py | 2 -- backend/conftest.py | 6 +++-- backend/media/conftest.py | 5 ---- backend/media/tests/test_media.py | 4 +-- backend/tag/conftest.py | 13 +++------- backend/tag/tests/test_tag_filter.py | 24 +++++++++--------- backend/tag/tests/test_tag_models.py | 4 +-- backend/tag/tests/test_views.py | 38 ++++++++++++++-------------- backend/visualization/conftest.py | 2 +- 9 files changed, 44 insertions(+), 54 deletions(-) diff --git a/backend/addcorpus/conftest.py b/backend/addcorpus/conftest.py index 6aff4e03f..eeb6d495c 
100644 --- a/backend/addcorpus/conftest.py +++ b/backend/addcorpus/conftest.py @@ -1,5 +1,4 @@ import pytest -import os from django.contrib.auth.models import Group from addcorpus.models import Corpus @@ -13,4 +12,3 @@ def group_with_access(db, basic_mock_corpus): yield group group.delete() -here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/conftest.py b/backend/conftest.py index bb4e61013..e128a214f 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -1,6 +1,5 @@ import json from time import sleep -import shutil import os import pytest import requests @@ -9,7 +8,6 @@ from addcorpus.json_corpora.import_json import import_json_corpus from ianalyzer.elasticsearch import elasticsearch -from ianalyzer.settings_test import MEDIA_ROOT from addcorpus.python_corpora.load_corpus import load_corpus_definition from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora from es import es_index as index @@ -132,6 +130,10 @@ def large_mock_corpus() -> str: def ml_mock_corpus() -> str: return 'multilingual-mock-corpus' +@pytest.fixture() +def media_mock_corpus() -> str: + return 'media-mock-corpus' + @pytest.fixture() def tag_mock_corpus() -> str: diff --git a/backend/media/conftest.py b/backend/media/conftest.py index eb7fa01a3..e69de29bb 100644 --- a/backend/media/conftest.py +++ b/backend/media/conftest.py @@ -1,5 +0,0 @@ -import pytest - -@pytest.fixture() -def mock_corpus(): - return 'media-mock-corpus' diff --git a/backend/media/tests/test_media.py b/backend/media/tests/test_media.py index a8034ad6e..c79c4725d 100644 --- a/backend/media/tests/test_media.py +++ b/backend/media/tests/test_media.py @@ -9,10 +9,10 @@ expected_url = f'/api/get_media?corpus=media-mock-corpus&image_path=images%2Fhamlet.png' -def test_media_views(client, mock_corpus, admin_client): +def test_media_views(client, media_mock_corpus, admin_client): response = admin_client.post( '/api/request_media', - {'corpus': mock_corpus, 'document': example_document}, + 
{'corpus': media_mock_corpus, 'document': example_document}, content_type='application/json' ) assert status.is_success(response.status_code) diff --git a/backend/tag/conftest.py b/backend/tag/conftest.py index d78310739..2e2abd5bc 100644 --- a/backend/tag/conftest.py +++ b/backend/tag/conftest.py @@ -3,14 +3,9 @@ from addcorpus.models import Corpus from tag.models import DOCS_PER_TAG_LIMIT, Tag, TaggedDocument -@pytest.fixture(scope='session') -def mock_corpus(): - return 'tagging-mock-corpus' - - @pytest.fixture() -def mock_corpus_obj(db, mock_corpus): - return Corpus.objects.get(name=mock_corpus) +def mock_corpus_obj(db, tag_mock_corpus): + return Corpus.objects.get(name=tag_mock_corpus) @pytest.fixture() @@ -86,8 +81,8 @@ def other_corpus(db): return name @pytest.fixture() -def multiple_tags(db, mock_corpus, auth_user): - corpus = Corpus.objects.get(name=mock_corpus) +def multiple_tags(db, tag_mock_corpus, auth_user): + corpus = Corpus.objects.get(name=tag_mock_corpus) riveting_tag = Tag.objects.create( name='riveting', user=auth_user diff --git a/backend/tag/tests/test_tag_filter.py b/backend/tag/tests/test_tag_filter.py index e22fc72b3..eb2ede93c 100644 --- a/backend/tag/tests/test_tag_filter.py +++ b/backend/tag/tests/test_tag_filter.py @@ -3,29 +3,29 @@ from visualization.query import set_query_text, MATCH_ALL -def test_tag_document_ids(mock_corpus, auth_user_tag, tagged_documents): +def test_tag_document_ids(tag_mock_corpus, auth_user_tag, tagged_documents): _, docs = tagged_documents - assert len(tag_document_ids([auth_user_tag], mock_corpus)) == auth_user_tag.count + assert len(tag_document_ids([auth_user_tag], tag_mock_corpus)) == auth_user_tag.count -def test_tag_filter(mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): - filter = tag_filter([auth_user_tag.id], mock_corpus) +def test_tag_filter(tag_mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): + filter = tag_filter([auth_user_tag.id], tag_mock_corpus) query = 
{'query': filter} - results = search.search(mock_corpus, query) + results = search.search(tag_mock_corpus, query) assert search.total_hits(results) == auth_user_tag.count -def test_search_with_tag(mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): +def test_search_with_tag(tag_mock_corpus, index_tag_mock_corpus, auth_user_tag, tagged_documents): query = set_query_text(MATCH_ALL, 'text') - results = search.search(mock_corpus, query) + results = search.search(tag_mock_corpus, query) assert search.total_hits(results) == 2 - query_with_tag = include_tag_filter(query, [auth_user_tag.id], mock_corpus) + query_with_tag = include_tag_filter(query, [auth_user_tag.id], tag_mock_corpus) - results_with_tag = search.search(mock_corpus, query_with_tag) + results_with_tag = search.search(tag_mock_corpus, query_with_tag) assert search.total_hits(results_with_tag) == 1 -def test_search_multiple_tags(mock_corpus, index_tag_mock_corpus, multiple_tags): +def test_search_multiple_tags(tag_mock_corpus, index_tag_mock_corpus, multiple_tags): ids = [tag.id for tag in multiple_tags] - query = include_tag_filter(MATCH_ALL, ids, mock_corpus) - results = search.search(mock_corpus, query) + query = include_tag_filter(MATCH_ALL, ids, tag_mock_corpus) + results = search.search(tag_mock_corpus, query) assert search.total_hits(results) == 2 diff --git a/backend/tag/tests/test_tag_models.py b/backend/tag/tests/test_tag_models.py index 2d0393ce5..d32b01e7e 100644 --- a/backend/tag/tests/test_tag_models.py +++ b/backend/tag/tests/test_tag_models.py @@ -10,10 +10,10 @@ def test_tag_models(db, auth_user, auth_user_tag, tagged_documents): assert auth_user_tag.count == 3 -def test_tag_lookup(mock_corpus, tagged_documents, +def test_tag_lookup(tag_mock_corpus, tagged_documents, auth_user_tag, admin_user_tag): instances, docs = tagged_documents - corpus = Corpus.objects.get(name=mock_corpus) + corpus = Corpus.objects.get(name=tag_mock_corpus) for doc, instance in zip(docs, instances): 
tagged_docs = TaggedDocument.objects.get(doc_id=doc) diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index 976923473..9aed97e05 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -58,8 +58,8 @@ def test_admin_delete(admin_client, auth_user_tag): assert resp.status_code == status.HTTP_204_NO_CONTENT assert n_tags() == 0 -def test_list_corpus_tags(auth_client, auth_user_tag, tagged_documents, mock_corpus, other_corpus): - response = auth_client.get(f'/api/tag/tags/?corpus={mock_corpus}') +def test_list_corpus_tags(auth_client, auth_user_tag, tagged_documents, tag_mock_corpus, other_corpus): + response = auth_client.get(f'/api/tag/tags/?corpus={tag_mock_corpus}') assert status.is_success(response.status_code) assert len(response.data) == 1 @@ -73,20 +73,20 @@ def test_list_corpus_tags(auth_client, auth_user_tag, tagged_documents, mock_cor not_found = auth_client.get('/api/tag/tags/?corpus=nonexistent') assert not_found.status_code == status.HTTP_404_NOT_FOUND -def test_get_document_tags(auth_user, auth_client, auth_user_tag, tagged_documents, mock_corpus): +def test_get_document_tags(auth_user, auth_client, auth_user_tag, tagged_documents, tag_mock_corpus): doc_id = tagged_documents[1][0] - response = auth_client.get(f'/api/tag/document_tags/{mock_corpus}/{doc_id}') + response = auth_client.get(f'/api/tag/document_tags/{tag_mock_corpus}/{doc_id}') assert status.is_success(response.status_code) - response = auth_client.get(f'/api/tag/document_tags/{mock_corpus}/not-tagged') + response = auth_client.get(f'/api/tag/document_tags/{tag_mock_corpus}/not-tagged') assert status.is_success(response.status_code) -def test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_corpus_acces): +def test_patch_document_tags(auth_client, auth_user_tag, tag_mock_corpus, auth_user_corpus_acces): assert auth_user_tag.count == 0 new_doc = 'a-new-document' patch_request = lambda data: auth_client.patch( - 
f'/api/tag/document_tags/{mock_corpus}/{new_doc}', + f'/api/tag/document_tags/{tag_mock_corpus}/{new_doc}', data, content_type='application/json' ) @@ -110,10 +110,10 @@ def test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_ assert auth_user_tag.count == 0 -def test_assign_multiple_tags_at_once(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): +def test_assign_multiple_tags_at_once(auth_client, multiple_tags, tag_mock_corpus, auth_user_corpus_acces): doc = 'test' patch_request = lambda data: auth_client.patch( - f'/api/tag/document_tags/{mock_corpus}/{doc}', + f'/api/tag/document_tags/{tag_mock_corpus}/{doc}', data, content_type='application/json' ) @@ -125,10 +125,10 @@ def test_assign_multiple_tags_at_once(auth_client, multiple_tags, mock_corpus, a doc = TaggedDocument.objects.get(doc_id=doc) assert doc.tags.count() == len(multiple_tags) -def test_assign_multiple_tags_one_by_one(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): +def test_assign_multiple_tags_one_by_one(auth_client, multiple_tags, tag_mock_corpus, auth_user_corpus_acces): doc = 'test' patch_request = lambda data: auth_client.patch( - f'/api/tag/document_tags/{mock_corpus}/{doc}', + f'/api/tag/document_tags/{tag_mock_corpus}/{doc}', data, content_type='application/json' ) @@ -142,13 +142,13 @@ def test_assign_multiple_tags_one_by_one(auth_client, multiple_tags, mock_corpus doc = TaggedDocument.objects.get(doc_id=doc) assert doc.tags.count() == i + 1 -def test_patch_tags_contamination(auth_client, auth_user_tag, admin_user_tag, mock_corpus, mock_corpus_obj, auth_user_corpus_acces): +def test_patch_tags_contamination(auth_client, auth_user_tag, admin_user_tag, tag_mock_corpus, mock_corpus_obj, auth_user_corpus_acces): ''' Verify that patching tags does not affect the tags of other users ''' document = 'some-document' - route = f'/api/tag/document_tags/{mock_corpus}/{document}' + route = f'/api/tag/document_tags/{tag_mock_corpus}/{document}' 
kwargs = {'content_type': 'application/json'} doc = TaggedDocument.objects.create(corpus=mock_corpus_obj, doc_id=document) @@ -180,17 +180,17 @@ def search_with_tag(client, corpus_name, tag_id): } return client.post(route, data, content_type = 'application/json') -def test_search_view_with_tag(auth_client, mock_corpus, auth_user_tag, tagged_documents, index_tag_mock_corpus): - response = search_with_tag(auth_client, mock_corpus, auth_user_tag.id) +def test_search_view_with_tag(auth_client, tag_mock_corpus, auth_user_tag, tagged_documents, index_tag_mock_corpus): + response = search_with_tag(auth_client, tag_mock_corpus, auth_user_tag.id) assert status.is_success(response.status_code) assert len(hits(response.data)) == auth_user_tag.count -def test_search_view_unauthorized_tag(auth_client, mock_corpus, admin_user_tag, auth_user_corpus_acces): - response = search_with_tag(auth_client, mock_corpus, admin_user_tag.id) +def test_search_view_unauthorized_tag(auth_client, tag_mock_corpus, admin_user_tag, auth_user_corpus_acces): + response = search_with_tag(auth_client, tag_mock_corpus, admin_user_tag.id) assert response.status_code == status.HTTP_403_FORBIDDEN -def test_search_view_nonexistent_tag(auth_client, mock_corpus, auth_user_corpus_acces): +def test_search_view_nonexistent_tag(auth_client, tag_mock_corpus, auth_user_corpus_acces): not_a_real_tag = 12345678 - response = search_with_tag(auth_client, mock_corpus, not_a_real_tag) + response = search_with_tag(auth_client, tag_mock_corpus, not_a_real_tag) assert response.status_code == status.HTTP_404_NOT_FOUND diff --git a/backend/visualization/conftest.py b/backend/visualization/conftest.py index 59af92409..79ed94006 100644 --- a/backend/visualization/conftest.py +++ b/backend/visualization/conftest.py @@ -68,7 +68,7 @@ def es_client_k_hits(): ''' return MockClient(500) -@pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus'], scope='session') +@pytest.fixture(params=['small-mock-corpus', 
'large-mock-corpus']) def mock_corpus(request): 'parametrised version of the mock corpus fixtures: runs with both' From 5a282748330ff452372bf2e009335f458880e32a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 15:41:35 +0200 Subject: [PATCH 28/94] draft export function --- backend/addcorpus/es_mappings.py | 4 + backend/addcorpus/json_corpora/constants.py | 2 + backend/addcorpus/json_corpora/export_json.py | 81 +++++++++++++++++++ backend/addcorpus/json_corpora/import_json.py | 6 +- .../json_corpora/tests/test_export.py | 7 ++ backend/addcorpus/models.py | 1 - backend/addcorpus/validation/creation.py | 4 +- 7 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 backend/addcorpus/json_corpora/constants.py create mode 100644 backend/addcorpus/json_corpora/export_json.py create mode 100644 backend/addcorpus/json_corpora/tests/test_export.py diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 1451a92d8..921870a65 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -1,5 +1,9 @@ +from typing import Dict from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available +def primary_mapping_type(es_mapping: Dict) -> str: + return es_mapping.get('type', None) + def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True): ''' Mapping for the main content field. 
Options: diff --git a/backend/addcorpus/json_corpora/constants.py b/backend/addcorpus/json_corpora/constants.py new file mode 100644 index 000000000..66ac0c310 --- /dev/null +++ b/backend/addcorpus/json_corpora/constants.py @@ -0,0 +1,2 @@ +DEFAULT_CSV_DELIMITER = ',' +DATE_FORMAT = '%Y-%m-%d' diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py new file mode 100644 index 000000000..a72584c88 --- /dev/null +++ b/backend/addcorpus/json_corpora/export_json.py @@ -0,0 +1,81 @@ +from typing import Dict +from datetime import date +from addcorpus.models import Corpus, CorpusConfiguration, Field +from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT + +def export_json_corpus(corpus: Corpus) -> Dict: + config = corpus.configuration + data = {'name': corpus.name} + data['meta'] = export_corpus_meta(config) + data['source_data'] = export_corpus_source_data(config) + data['options'] = export_corpus_options(config) + data['fields'] = [ + export_json_field(field) for field in config.fields.all() + ] + return data + +def export_corpus_meta(configuration: CorpusConfiguration) -> Dict: + return { + 'title': configuration.title, + 'category': configuration.category, + 'description': configuration.description, + 'languages': configuration.languages, + 'date_range': { + 'min': export_date(configuration.min_date), + 'max': export_date(configuration.max_date), + } + } + +def export_date(date: date): + return date.strftime(DATE_FORMAT) + +def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict: + data = { + 'type': 'csv' + } + if configuration.source_data_delimiter != DEFAULT_CSV_DELIMITER: + data['options'] = {'delimiter': configuration.source_data_delimiter} + return data + +def export_corpus_options(configuration: CorpusConfiguration) -> Dict: + return {} + + +def export_json_field(field: Field) -> Dict: + return { + 'name': field.name, + 'display_name': field.display_name, + 
'description': field.description, + 'type': export_field_type(field), + 'options': export_field_options(field), + 'extract': export_field_extract(field) + } + + +def export_field_type(field: Field) -> str: + if field.display_type == 'text' or field.display_type == 'keyword': + return 'text_metadata' + return field.display_type + + +def export_field_options(field: Field) -> Dict: + return { + 'filter': export_field_filter(field), + 'hidden': field.hidden, + 'preview': field.results_overview, + 'search': field.searchable, + 'sort': field.sortable, + 'visualize': len(field.visualizations) > 0 + } + + +def export_field_filter(field: Field) -> str: + if field.search_filter != {}: + return 'show' + if field.display_type == 'text_content': + return 'none' + return 'hide' + + +def export_field_extract(field: Field) -> Dict: + return {'column': field.extract_column} diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index a9a0ae298..4650a0c84 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -8,7 +8,7 @@ from addcorpus.constants import VisualizationType from addcorpus.validation.publishing import _any_date_fields from django.conf import settings - +from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT def import_json_corpus(data: Dict) -> Corpus: name = get_path(data, 'name') @@ -51,12 +51,12 @@ def _parse_configuration(data: Dict, configuration: CorpusConfiguration) -> Corp configuration.document_context = get_path( data, 'options', 'document_context') or {} configuration.source_data_delimiter = get_path( - data, 'source_data', 'options', 'delimiter') or ',' + data, 'source_data', 'options', 'delimiter') or DEFAULT_CSV_DELIMITER return configuration def _parse_date(date: str): - return datetime.strptime(date, '%Y-%m-%d').date() + return datetime.strptime(date, DATE_FORMAT).date() def _import_fields(data: Dict, configuration: 
CorpusConfiguration) -> None: diff --git a/backend/addcorpus/json_corpora/tests/test_export.py b/backend/addcorpus/json_corpora/tests/test_export.py new file mode 100644 index 000000000..59c379044 --- /dev/null +++ b/backend/addcorpus/json_corpora/tests/test_export.py @@ -0,0 +1,7 @@ +from addcorpus.json_corpora.export_json import export_json_corpus +from addcorpus.models import Corpus + +def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data): + result = export_json_corpus(json_mock_corpus) + assert result == json_corpus_data + diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index 532ac5e18..8b8654574 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -26,7 +26,6 @@ MAX_LENGTH_DESCRIPTION = 254 MAX_LENGTH_TITLE = 256 - class Corpus(models.Model): name = models.SlugField( max_length=MAX_LENGTH_NAME, diff --git a/backend/addcorpus/validation/creation.py b/backend/addcorpus/validation/creation.py index 196ca9b95..8b0a2666a 100644 --- a/backend/addcorpus/validation/creation.py +++ b/backend/addcorpus/validation/creation.py @@ -11,12 +11,10 @@ from addcorpus.python_corpora.filters import \ VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS from django.core.exceptions import ValidationError - +from addcorpus.es_mappings import primary_mapping_type from langcodes import tag_is_valid -def primary_mapping_type(es_mapping): - return es_mapping.get('type', None) def supports_full_text_search(es_mapping): is_text = primary_mapping_type(es_mapping) == MappingType.TEXT.value From 806df7d5a816fd08392b1b505e1aa6beecb618b3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 15:45:27 +0200 Subject: [PATCH 29/94] export corpus options --- backend/addcorpus/json_corpora/export_json.py | 13 +++++++++++-- backend/corpora_test/mock_corpus.json | 3 +-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py index 
a72584c88..ae47bc8df 100644 --- a/backend/addcorpus/json_corpora/export_json.py +++ b/backend/addcorpus/json_corpora/export_json.py @@ -8,7 +8,9 @@ def export_json_corpus(corpus: Corpus) -> Dict: data = {'name': corpus.name} data['meta'] = export_corpus_meta(config) data['source_data'] = export_corpus_source_data(config) - data['options'] = export_corpus_options(config) + options = export_corpus_options(config) + if options: + data['options'] = options data['fields'] = [ export_json_field(field) for field in config.fields.all() ] @@ -38,7 +40,14 @@ def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict: return data def export_corpus_options(configuration: CorpusConfiguration) -> Dict: - return {} + data = {} + if configuration.document_context: + data['document_context'] = configuration.document_context + if configuration.default_sort: + data['default_sort'] = configuration.default_sort + if configuration.language_field: + data['language_field'] = configuration.language_field + return data def export_json_field(field: Field) -> Dict: diff --git a/backend/corpora_test/mock_corpus.json b/backend/corpora_test/mock_corpus.json index 794dcfb72..710b68f12 100644 --- a/backend/corpora_test/mock_corpus.json +++ b/backend/corpora_test/mock_corpus.json @@ -47,6 +47,5 @@ "column": "line" } } - ], - "options": {} + ] } From e45eae45321a6b0017cb4e15c968bb295eb7bce5 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 16:02:24 +0200 Subject: [PATCH 30/94] add url display type --- backend/addcorpus/json_corpora/export_json.py | 2 +- backend/addcorpus/json_corpora/import_json.py | 2 +- .../migrations/0022_add_url_display_type.py | 18 ++++++++++++++++++ backend/addcorpus/models.py | 3 ++- backend/corpora/dbnl/dbnl.py | 1 + .../dutchnewspapers/dutchnewspapers_public.py | 1 + backend/corpora/goodreads/goodreads.json | 2 +- backend/corpora/goodreads/goodreads.py | 1 + .../corpora/parliament/utils/field_defaults.py | 1 + 
.../peaceportal/utils/field_defaults.py | 1 + backend/corpora/rechtspraak/rechtspraak.py | 1 + .../document-view/document-view.component.ts | 2 +- frontend/src/app/models/corpus.ts | 3 ++- 13 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 backend/addcorpus/migrations/0022_add_url_display_type.py diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py index ae47bc8df..7e9bd11f1 100644 --- a/backend/addcorpus/json_corpora/export_json.py +++ b/backend/addcorpus/json_corpora/export_json.py @@ -81,7 +81,7 @@ def export_field_options(field: Field) -> Dict: def export_field_filter(field: Field) -> str: if field.search_filter != {}: return 'show' - if field.display_type == 'text_content': + if field.display_type == 'text_content' or field.display_type == 'url': return 'none' return 'hide' diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index 4650a0c84..d0e953288 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -189,7 +189,7 @@ def _parse_language(field_data: Dict) -> str: def _parse_url_field(field: Field, field_data: Dict) -> Field: field.es_mapping = es_mappings.keyword_mapping() - field.display_type = 'keyword' + field.display_type = 'url' field.search_filter = {} return field diff --git a/backend/addcorpus/migrations/0022_add_url_display_type.py b/backend/addcorpus/migrations/0022_add_url_display_type.py new file mode 100644 index 000000000..24548c34a --- /dev/null +++ b/backend/addcorpus/migrations/0022_add_url_display_type.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.10 on 2024-04-16 13:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('addcorpus', '0021_corpusconfiguration_data_directory'), + ] + + operations = [ + migrations.AlterField( + model_name='field', + name='display_type', + 
field=models.CharField(choices=[('text_content', 'text content'), ('text', 'text'), ('keyword', 'keyword'), ('date', 'date'), ('date_range', 'date_range'), ('integer', 'integer'), ('float', 'float'), ('boolean', 'boolean'), ('geo_point', 'geo_point'), ('url', 'url')], help_text='as what type of data this field is rendered in the interface', max_length=16), + ), + ] diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index 8b8654574..01aa33535 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -261,7 +261,8 @@ def clean(self): (MappingType.INTEGER.value, 'integer'), (MappingType.FLOAT.value, 'float'), (MappingType.BOOLEAN.value, 'boolean'), - (MappingType.GEO_POINT.value, 'geo_point') + (MappingType.GEO_POINT.value, 'geo_point'), + ('url', 'url'), ] FIELD_VISUALIZATIONS = [ diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py index 33933bb44..2e1f02a85 100644 --- a/backend/corpora/dbnl/dbnl.py +++ b/backend/corpora/dbnl/dbnl.py @@ -234,6 +234,7 @@ def _xml_files(self): url = FieldDefinition( name='url', display_name='Source URL', + display_type='url', description='Link to the book\'s page in DBNL', extractor=Metadata('url'), es_mapping=keyword_mapping(), diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index a7962df50..ebf459893 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -134,6 +134,7 @@ def fields(self): name="url", display_name="Delpher URL", description="Link to record on Delpher", + display_type='url', es_mapping=keyword_mapping(), extractor=XML(tag='identifier', toplevel=True, diff --git a/backend/corpora/goodreads/goodreads.json b/backend/corpora/goodreads/goodreads.json index adfda1eab..3feb3cf7c 100644 --- a/backend/corpora/goodreads/goodreads.json +++ b/backend/corpora/goodreads/goodreads.json @@ -149,7 +149,7 @@ 
"name": "url", "display_name": "Source URL", "description": "Link to the review on Goodreads", - "type": "text_metadata", + "type": "url", "options": { "search": false, "filter": "none", diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py index 674c686bc..eefb1bbb8 100644 --- a/backend/corpora/goodreads/goodreads.py +++ b/backend/corpora/goodreads/goodreads.py @@ -162,6 +162,7 @@ def sources(self, start, end): FieldDefinition( name='url', display_name='Source URL', + display_type='url', description='Link to the the review on Goodreads', extractor=CSV('url'), es_mapping={'type': 'keyword'}, diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index 1ca17f3a3..9365ba1b5 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -547,6 +547,7 @@ def url(): return FieldDefinition( name='url', display_name='Source URL', + display_type='url', description='URL to source file of this speech', es_mapping=keyword_mapping(), searchable=False, diff --git a/backend/corpora/peaceportal/utils/field_defaults.py b/backend/corpora/peaceportal/utils/field_defaults.py index 21f8943fe..1800e25eb 100644 --- a/backend/corpora/peaceportal/utils/field_defaults.py +++ b/backend/corpora/peaceportal/utils/field_defaults.py @@ -30,6 +30,7 @@ def url(): return FieldDefinition( name='url', display_name='Source URL', + display_type='url', description='URL of the inscription entry in the source database.', es_mapping=keyword_mapping(), search_field_core=True diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index 47b3b2d68..3f07a3488 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -309,6 +309,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='url', 
display_name='Source URL', + display_type='url', description='URL of the case on rechtspraak.nl', es_mapping=keyword_mapping(), extractor=rdf_description_extractor( diff --git a/frontend/src/app/document-view/document-view.component.ts b/frontend/src/app/document-view/document-view.component.ts index 3452d4906..efb276d85 100644 --- a/frontend/src/app/document-view/document-view.component.ts +++ b/frontend/src/app/document-view/document-view.component.ts @@ -69,7 +69,7 @@ export class DocumentViewComponent implements OnChanges { } isUrlField(field: CorpusField) { - return field.name === 'url' || field.name.startsWith('url_'); + return field.displayType === 'url'; } isGeoPointField(field: CorpusField) { diff --git a/frontend/src/app/models/corpus.ts b/frontend/src/app/models/corpus.ts index 5aaaeaa19..04d71cae1 100644 --- a/frontend/src/app/models/corpus.ts +++ b/frontend/src/app/models/corpus.ts @@ -64,7 +64,8 @@ export interface DocumentContext { } -export type FieldDisplayType = 'text_content' | 'px' | 'keyword' | 'integer' | 'text' | 'date' | 'boolean'; +export type FieldDisplayType = + 'text_content' | 'px' | 'keyword' | 'integer' | 'text' | 'date' | 'boolean' | 'url'; /** Corpus field info as sent by the backend api */ export interface ApiCorpusField { From fd84a9a06128a7c8b3d0de77a551f6f4bee4251f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 16:06:05 +0200 Subject: [PATCH 31/94] set filter to none for text mapping --- backend/addcorpus/json_corpora/export_json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py index 7e9bd11f1..80426847c 100644 --- a/backend/addcorpus/json_corpora/export_json.py +++ b/backend/addcorpus/json_corpora/export_json.py @@ -2,6 +2,7 @@ from datetime import date from addcorpus.models import Corpus, CorpusConfiguration, Field from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, 
DATE_FORMAT +from addcorpus.es_mappings import primary_mapping_type def export_json_corpus(corpus: Corpus) -> Dict: config = corpus.configuration @@ -81,7 +82,7 @@ def export_field_options(field: Field) -> Dict: def export_field_filter(field: Field) -> str: if field.search_filter != {}: return 'show' - if field.display_type == 'text_content' or field.display_type == 'url': + if primary_mapping_type(field.es_mapping) == 'text' or field.display_type == 'url': return 'none' return 'hide' From ae7d9da83a73dcfdbb7445a5f5b6261308f70a3b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 16:12:01 +0200 Subject: [PATCH 32/94] use fixtures for json field definitions --- backend/addcorpus/json_corpora/conftest.py | 128 ++++++++++++++++ .../json_corpora/tests/test_import.py | 141 ++---------------- 2 files changed, 142 insertions(+), 127 deletions(-) create mode 100644 backend/addcorpus/json_corpora/conftest.py diff --git a/backend/addcorpus/json_corpora/conftest.py b/backend/addcorpus/json_corpora/conftest.py new file mode 100644 index 000000000..05d2549f5 --- /dev/null +++ b/backend/addcorpus/json_corpora/conftest.py @@ -0,0 +1,128 @@ +import pytest + +@pytest.fixture() +def content_field_json(): + return { + 'name': 'content', + 'display_name': 'Content', + 'description': 'Bla bla bla', + 'type': 'text_content', + 'language': 'en', + 'options': { + 'search': True, + 'filter': 'none', + 'preview': True, + 'visualize': True, + 'sort': False, + 'hidden': False + }, + 'extract': {'column': 'content'} + } + +@pytest.fixture() +def keyword_field_json(): + return { + 'name': 'author', + 'display_name': 'Author', + 'description': 'Author of the text', + 'type': 'text_metadata', + 'options': { + 'search': True, + 'filter': 'show', + 'preview': True, + 'visualize': True, + 'sort': False, + 'hidden': False + }, + 'extract': {'column': 'author'} + } + +@pytest.fixture() +def int_field_json(): + return { + 'name': 'year', + 'display_name': 'Year', + 'description': 
'Year in which the text was written', + 'type': 'integer', + 'options': { + 'search': False, + 'filter': 'show', + 'preview': False, + 'visualize': True, + 'sort': True, + 'hidden': False + }, + 'extract': {'column': 'year'} + } + +@pytest.fixture() +def float_field_json(): + return { + 'name': 'ocr_confidence', + 'display_name': 'OCR confidence', + 'description': 'Confidence level of optical character recognition output', + 'type': 'float', + 'options': { + 'search': False, + 'filter': 'hide', + 'preview': False, + 'visualize': False, + 'sort': False, + 'hidden': False + }, + 'extract': {'column': 'ocr'} + } + +@pytest.fixture() +def date_field_json(): + return { + 'name': 'date', + 'display_name': 'Date', + 'description': 'Date on which the text was written', + 'type': 'date', + 'options': { + 'search': False, + 'filter': 'show', + 'preview': True, + 'visualize': True, + 'sort': True, + 'hidden': False + }, + 'extract': {'column': 'date'} + } + +@pytest.fixture() +def boolean_field_json(): + return { + 'name': 'author_known', + 'display_name': 'Author known', + 'description': 'Whether the author of the text is known', + 'type': 'boolean', + 'options': { + 'search': False, + 'filter': 'show', + 'preview': False, + 'visualize': True, + 'sort': False, + 'hidden': False + }, + 'extract': {'column': 'author_known'} + } + +@pytest.fixture() +def geo_field_json(): + return { + 'name': 'location', + 'display_name': 'Location', + 'description': 'Location where the text was published', + 'type': 'geo_json', + 'options': { + 'search': False, + 'filter': 'none', + 'preview': False, + 'visualize': False, + 'sort': False, + 'hidden': False + }, + 'extract': {'column': 'location'} + } diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 72adff33b..70d68bcb2 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -30,25 +30,8 @@ def 
test_import(db, json_corpus_data): assert line_field.display_type == 'text_content' -def test_parse_content_field(): - data = { - 'name': 'content', - 'display_name': 'Content', - 'description': 'Bla bla bla', - 'type': 'text_content', - 'language': 'en', - 'options': { - 'search': True, - 'filter': 'none', - 'preview': True, - 'visualize': True, - 'sort': False, - 'hidden': False - }, - 'extract': {'column': 'content'} - } - - field = _parse_field(data) +def test_parse_content_field(content_field_json): + field = _parse_field(content_field_json) assert field.name == 'content' assert field.display_name == 'Content' assert field.display_type == 'text_content' @@ -68,24 +51,8 @@ def test_parse_content_field(): assert field.extract_column == 'content' -def test_parse_keyword_field(): - data = { - 'name': 'author', - 'display_name': 'Author', - 'description': 'Author of the text', - 'type': 'text_metadata', - 'options': { - 'search': True, - 'filter': 'show', - 'preview': True, - 'visualize': True, - 'sort': False, - 'hidden': False - }, - 'extract': {'column': 'author'} - } - - field = _parse_field(data) +def test_parse_keyword_field(keyword_field_json): + field = _parse_field(keyword_field_json) assert field.name == 'author' assert field.display_type == 'keyword' assert field.search_filter['name'] == 'MultipleChoiceFilter' @@ -99,24 +66,8 @@ def test_parse_keyword_field(): assert field.language == '' -def test_parse_int_field(): - data = { - 'name': 'year', - 'display_name': 'Year', - 'description': 'Year in which the text was written', - 'type': 'integer', - 'options': { - 'search': False, - 'filter': 'show', - 'preview': False, - 'visualize': True, - 'sort': True, - 'hidden': False - }, - 'extract': {'column': 'year'} - } - - field = _parse_field(data) +def test_parse_int_field(int_field_json): + field = _parse_field(int_field_json) assert field.name == 'year' assert field.display_type == 'integer' assert field.search_filter['name'] == 'RangeFilter' @@ -131,24 
+82,8 @@ def test_parse_int_field(): assert field.searchable == False -def test_parse_float_field(): - data = { - 'name': 'ocr_confidence', - 'display_name': 'OCR confidence', - 'description': 'Confidence level of optical character recognition output', - 'type': 'float', - 'options': { - 'search': False, - 'filter': 'hide', - 'preview': False, - 'visualize': False, - 'sort': False, - 'hidden': False - }, - 'extract': {'column': 'ocr'} - } - - field = _parse_field(data) +def test_parse_float_field(float_field_json): + field = _parse_field(float_field_json) assert field.name == 'ocr_confidence' assert field.display_type == 'float' assert field.search_filter == {} @@ -163,24 +98,8 @@ def test_parse_float_field(): assert field.downloadable == True -def test_parse_date_field(): - data = { - 'name': 'date', - 'display_name': 'Date', - 'description': 'Date on which the text was written', - 'type': 'date', - 'options': { - 'search': False, - 'filter': 'show', - 'preview': True, - 'visualize': True, - 'sort': True, - 'hidden': False - }, - 'extract': {'column': 'date'} - } - - field = _parse_field(data) +def test_parse_date_field(date_field_json): + field = _parse_field(date_field_json) assert field.name == 'date' assert field.display_type == 'date' assert field.search_filter['name'] == 'DateFilter' @@ -194,24 +113,8 @@ def test_parse_date_field(): assert field.searchable == False -def test_parse_boolean_field(): - data = { - 'name': 'author_known', - 'display_name': 'Author known', - 'description': 'Whether the author of the text is known', - 'type': 'boolean', - 'options': { - 'search': False, - 'filter': 'show', - 'preview': False, - 'visualize': True, - 'sort': False, - 'hidden': False - }, - 'extract': {'column': 'author_known'} - } - - field = _parse_field(data) +def test_parse_boolean_field(boolean_field_json): + field = _parse_field(boolean_field_json) assert field.name == 'author_known' assert field.display_type == 'boolean' assert field.search_filter['name'] == 
'BooleanFilter' @@ -225,24 +128,8 @@ def test_parse_boolean_field(): assert field.searchable == False -def test_parse_geo_field(): - data = { - 'name': 'location', - 'display_name': 'Location', - 'description': 'Location where the text was published', - 'type': 'geo_json', - 'options': { - 'search': False, - 'filter': 'none', - 'preview': False, - 'visualize': False, - 'sort': False, - 'hidden': False - }, - 'extract': {'column': 'location'} - } - - field = _parse_field(data) +def test_parse_geo_field(geo_field_json): + field = _parse_field(geo_field_json) assert field.name == 'location' assert field.display_type == 'keyword' assert field.search_filter == {} From 5d2f0d5a84141fd2a754ab47b680ad5381e450ea Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 16:22:43 +0200 Subject: [PATCH 33/94] add test exporting different field types --- backend/addcorpus/json_corpora/conftest.py | 19 +++++++++++++++++++ .../json_corpora/tests/test_export.py | 7 ++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/backend/addcorpus/json_corpora/conftest.py b/backend/addcorpus/json_corpora/conftest.py index 05d2549f5..952d73df8 100644 --- a/backend/addcorpus/json_corpora/conftest.py +++ b/backend/addcorpus/json_corpora/conftest.py @@ -126,3 +126,22 @@ def geo_field_json(): }, 'extract': {'column': 'location'} } + +@pytest.fixture( + params=['content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo'] +) +def any_field_json( + request, content_field_json, keyword_field_json, int_field_json, float_field_json, + date_field_json, boolean_field_json, geo_field_json +): + field_type = request.param + funcs = { + 'content': content_field_json, + 'keyword': keyword_field_json, + 'int': int_field_json, + 'float': float_field_json, + 'date': date_field_json, + 'boolean': boolean_field_json, + 'geo': geo_field_json, + } + return funcs[field_type] diff --git a/backend/addcorpus/json_corpora/tests/test_export.py 
b/backend/addcorpus/json_corpora/tests/test_export.py index 59c379044..5e4640d6a 100644 --- a/backend/addcorpus/json_corpora/tests/test_export.py +++ b/backend/addcorpus/json_corpora/tests/test_export.py @@ -1,7 +1,12 @@ -from addcorpus.json_corpora.export_json import export_json_corpus +from addcorpus.json_corpora.export_json import export_json_corpus, export_json_field from addcorpus.models import Corpus +from addcorpus.json_corpora.import_json import _parse_field def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data): result = export_json_corpus(json_mock_corpus) assert result == json_corpus_data +def test_field_export(any_field_json): + imported = _parse_field(any_field_json) + exported = export_json_field(imported) + assert any_field_json == exported From 8e54aa40ae43679a19a0262ced90d288ff07238c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 16 Apr 2024 16:32:27 +0200 Subject: [PATCH 34/94] fixes to import/export json --- backend/addcorpus/json_corpora/conftest.py | 2 +- backend/addcorpus/json_corpora/export_json.py | 12 ++++++++---- backend/addcorpus/json_corpora/import_json.py | 4 ++-- backend/addcorpus/json_corpora/tests/test_import.py | 2 +- backend/addcorpus/schemas/corpus.schema.json | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/backend/addcorpus/json_corpora/conftest.py b/backend/addcorpus/json_corpora/conftest.py index 952d73df8..f4da2b964 100644 --- a/backend/addcorpus/json_corpora/conftest.py +++ b/backend/addcorpus/json_corpora/conftest.py @@ -115,7 +115,7 @@ def geo_field_json(): 'name': 'location', 'display_name': 'Location', 'description': 'Location where the text was published', - 'type': 'geo_json', + 'type': 'geo_point', 'options': { 'search': False, 'filter': 'none', diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py index 80426847c..5178590fc 100644 --- a/backend/addcorpus/json_corpora/export_json.py +++ 
b/backend/addcorpus/json_corpora/export_json.py @@ -52,7 +52,7 @@ def export_corpus_options(configuration: CorpusConfiguration) -> Dict: def export_json_field(field: Field) -> Dict: - return { + data = { 'name': field.name, 'display_name': field.display_name, 'description': field.description, @@ -60,6 +60,9 @@ def export_json_field(field: Field) -> Dict: 'options': export_field_options(field), 'extract': export_field_extract(field) } + if field.language: + data['language'] = field.language + return data def export_field_type(field: Field) -> str: @@ -82,9 +85,10 @@ def export_field_options(field: Field) -> Dict: def export_field_filter(field: Field) -> str: if field.search_filter != {}: return 'show' - if primary_mapping_type(field.es_mapping) == 'text' or field.display_type == 'url': - return 'none' - return 'hide' + filterable_mappings = ['keyword', 'int', 'float', 'date', 'boolean'] + if primary_mapping_type(field.es_mapping) in filterable_mappings and field.display_type != 'url': + return 'hide' + return 'none' def export_field_extract(field: Field) -> Dict: diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index d0e953288..9b6a68747 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -109,7 +109,7 @@ def _parse_field(field_data: Dict, configuration: Optional[CorpusConfiguration] 'float': _parse_numeric_field, 'date': _parse_date_field, 'boolean': _parse_boolean_field, - 'geo_json': _parse_geo_field, + 'geo_point': _parse_geo_field, } field = parsers[field_type](field, field_data) @@ -269,7 +269,7 @@ def _parse_boolean_field(field: Field, field_data: Dict) -> Field: def _parse_geo_field(field: Field, field_data: Dict) -> Field: - field.display_type = 'keyword' + field.display_type = 'geo_point' field.es_mapping = es_mappings.geo_mapping() field.search_filter = {} return field diff --git a/backend/addcorpus/json_corpora/tests/test_import.py 
b/backend/addcorpus/json_corpora/tests/test_import.py index 70d68bcb2..64ddeaf8b 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -131,7 +131,7 @@ def test_parse_boolean_field(boolean_field_json): def test_parse_geo_field(geo_field_json): field = _parse_field(geo_field_json) assert field.name == 'location' - assert field.display_type == 'keyword' + assert field.display_type == 'geo_point' assert field.search_filter == {} assert field.results_overview == False assert field.csv_core == False diff --git a/backend/addcorpus/schemas/corpus.schema.json b/backend/addcorpus/schemas/corpus.schema.json index dda75940c..1c5626bd0 100644 --- a/backend/addcorpus/schemas/corpus.schema.json +++ b/backend/addcorpus/schemas/corpus.schema.json @@ -120,7 +120,7 @@ "float", "date", "boolean", - "geo_json" + "geo_point" ] }, "options": { From c6c5f7708e3157a21b556074700233eb78bc069b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 19 Mar 2024 16:09:42 +0100 Subject: [PATCH 35/94] add times citation docs --- backend/corpora/times/citation/citation.md | 39 ++++++++++++++++++++++ backend/corpora/times/times.py | 1 + 2 files changed, 40 insertions(+) create mode 100644 backend/corpora/times/citation/citation.md diff --git a/backend/corpora/times/citation/citation.md b/backend/corpora/times/citation/citation.md new file mode 100644 index 000000000..e272e5740 --- /dev/null +++ b/backend/corpora/times/citation/citation.md @@ -0,0 +1,39 @@ +If you cite this corpus, we recommend the following format. + +## Citing the complete corpus + +To cite the entire corpus, follow citation guidelines for databases or datasets. + +The [The Times Digital Archive](https://www.gale.com/intl/c/the-times-digital-archive) was originally published by Gale. Your citation should attribute the archive to Gale, but also make it clear that you used the version published on I-analyzer. + +### APA style + +> Gale (2017). 
*The Times Digital Archive* [data set]. I-analyzer. URL: {{ frontend_url }}/search/times + +See also [database information in references (APA)](https://apastyle.apa.org/style-grammar-guidelines/references/database-information). + +### MLA style + +[MLA guidelines](https://style.mla.org/) recommend against citing a database, and recommend [citing each individual work you use](https://style.mla.org/separate-entries-database-works/). If you want to cite the entire corpus nonetheless, we recommend the following format: + +> Gale. "The Times Digital Archive". *I-analyzer*, 2017, {{ frontend_url }}/search/times + +## Citing a single article + +To cite a single article, follow citation guidelines for newspaper articles. + +You can reference I-analyzer in your citation, though some citation guidelines recommend against this. (The Times corpus on I-analyzer is not publicly accessible, and newspaper articles can be accessed through other archives.) + +If you do reference I-analyzer, you can use the unique link of the article as the URL. + +### APA style + +> Rumbelow, H. (2005, October 19). Rebels cut ID-card majority. *The Times*, 68522, 2. + +See also [newspaper article references](https://apastyle.apa.org/style-grammar-guidelines/references/examples/newspaper-article-references). + +### MLA style + +> Rumbelow, Helen. "Rebels cut ID-card majority.". *I-analyzer*, {{ frontend_url }}/document/times/0FFO-2005-1019-0002-004. Originally published in *The Times*, 19 Oct. 2005 + +See also [citing a newspaper article on a website](https://style.mla.org/newspaper-article-on-web-site/). 
diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index b329ba4fb..c05ee90a4 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -35,6 +35,7 @@ class Times(XMLCorpusDefinition): image = 'times.jpg' scan_image_type = getattr(settings, 'TIMES_SCAN_IMAGE_TYPE', 'image/png') description_page = 'times.md' + citation_page = 'citation.md' languages = ['en'] category = 'periodical' From 81046f17e510731710034139de309e1b89bad2d3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 20 Mar 2024 13:39:53 +0100 Subject: [PATCH 36/94] add delpher citation docs --- .../citation/citation_public.md | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 backend/corpora/dutchnewspapers/citation/citation_public.md diff --git a/backend/corpora/dutchnewspapers/citation/citation_public.md b/backend/corpora/dutchnewspapers/citation/citation_public.md new file mode 100644 index 000000000..bef4bf6ad --- /dev/null +++ b/backend/corpora/dutchnewspapers/citation/citation_public.md @@ -0,0 +1,43 @@ +If you cite this corpus, we recommend the following format. + +## Citing the whole corpus + +To cite the entire corpus, follow citation guidelines for databases or datasets. + +The [Delpher corpus](https://www.delpher.nl/) is developed by the KB (Dutch Royal Library). Your citation should attribute the archive to the KB, but also make it clear that you used the version published on I-analyzer. + +### APA style + +> KB, Nationale Bibliotheek (2018). *Delpher* [data set]. I-analyzer. {{ frontend_url }}/search/dutchnewspapers-public + +### MLA style + +[MLA guidelines](https://style.mla.org/) recommend against citing a database, and recommend [citing each individual work you use](https://style.mla.org/separate-entries-database-works/). If you want to cite the entire corpus nonetheless, we recommend the following format: + +> KB, Nationale Bibliotheek. "Delpher". 
*I-analyzer*, 2018, {{ frontend_url }}/search/dutchnewspapers-public + +## Citing a single article + +To cite a single article, follow citation guidelines for newspaper articles. You can provide the URL to the article on I-analyzer in your citation. + +Most articles on I-analyzer also include the URL of the article on Delpher (this starts with `http://resolver.kb.nl`). These URLs are specifically created to be a stable reference point, so you may prefer to use that link instead. + +### APA style + +> Beurs- en Marktberigten (1851, January 30). *Rotterdamsche Courant*, 13. {{ frontend_url }}/document/dutchnewspapers-public/ddd:010979847:mpeg21:a0010 + +You can also refer directly to the version on Delpher: + +> Beurs- en Marktberigten (1851, January 30). *Rotterdamsche Courant*, 13. http://resolver.kb.nl/resolve?urn=ddd:010979847:mpeg21:a0010 + +See also [newspaper article references](https://apastyle.apa.org/style-grammar-guidelines/references/examples/newspaper-article-references). + +### MLA style + +> "Beurs- en Marktberigten.". *I-analyzer*, {{ frontend_url }}/document/dutchnewspapers-public/ddd:010979847:mpeg21:a0010. Originally published in *Rotterdamsche Courant*, 30 Jan. 1851. + +You can also refer directly to the version on Delpher: + +> "Beurs- en Marktberigten.". *Delpher*, http://resolver.kb.nl/resolve?urn=ddd:010979847:mpeg21:a0010. Originally published in *Rotterdamsche Courant*, 30 Jan. 1851. + +See also [citing a newspaper article on a website](https://style.mla.org/newspaper-article-on-web-site/). 
From eb00ea5c0c346d70c020d28ef81ff7b30b8702a9 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 18 Apr 2024 12:07:53 +0200 Subject: [PATCH 37/94] add citation page to corpus definition --- backend/corpora/dutchnewspapers/dutchnewspapers_public.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index a7962df50..eaf989e14 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -37,6 +37,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition): image = 'dutchnewspapers.jpg' languages = ['nl'] category = 'periodical' + citation_page = 'citation_public.md' @property def es_settings(self): From ef8fcf0c58e9923c63637bc7ec3455f7e9878939 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 21:54:44 +0000 Subject: [PATCH 38/94] Bump tqdm from 4.66.1 to 4.66.3 in /backend Bumps [tqdm](https://github.com/tqdm/tqdm) from 4.66.1 to 4.66.3. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.66.1...v4.66.3) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 994e23318..4c990db2c 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -381,7 +381,7 @@ tornado==6.3.3 # via # django-livereload-server # flower -tqdm==4.66.1 +tqdm==4.66.3 # via # -r requirements.in # nltk From 7ac19d79e16841ce2f2bb8f3f53a282ac5d5baa6 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Mon, 6 May 2024 16:29:07 +0200 Subject: [PATCH 39/94] Use database corpus in indexing --- backend/addcorpus/python_corpora/corpus.py | 33 ++--- .../addcorpus/python_corpora/save_corpus.py | 3 +- backend/conftest.py | 32 +++-- .../jewishmigration/test_jewishmigration.py | 46 ++++-- backend/es/conftest.py | 2 +- backend/es/es_alias.py | 13 +- backend/es/es_index.py | 134 ++++++++++++------ backend/es/tests/test_alias.py | 42 +++--- backend/es/tests/test_es_index.py | 39 +++-- 9 files changed, 214 insertions(+), 130 deletions(-) diff --git a/backend/addcorpus/python_corpora/corpus.py b/backend/addcorpus/python_corpora/corpus.py index 2ce7fe663..d0602b0bc 100644 --- a/backend/addcorpus/python_corpora/corpus.py +++ b/backend/addcorpus/python_corpora/corpus.py @@ -20,6 +20,7 @@ logger = logging.getLogger('indexing') + class CorpusDefinition(Reader): ''' Subclasses of this class define corpora and their documents by specifying: @@ -159,7 +160,7 @@ def word_models_present(self): ''' if word models are present for this corpus ''' - return self.word_model_path != None and isdir(self.word_model_path) + return self.word_model_path is not None and isdir(self.word_model_path) @property def new_highlight(self): @@ -171,7 +172,7 @@ def new_highlight(self): ''' try: highlight_corpora = settings.NEW_HIGHLIGHT_CORPORA - except: + except Exception: return False return self.title in highlight_corpora @@ -242,19 +243,6 @@ def request_media(self, document, corpus_name): ''' return 
{'media': None, 'info': None} - def es_mapping(self): - ''' - Create the ElasticSearch mapping for the fields of this corpus. May be - passed to the body of an ElasticSearch index creation request. - ''' - return { - 'properties': { - field.name: field.es_mapping - for field in self.fields - if field.es_mapping and not field.skip - } - } - def sources(self, start=datetime.min, end=datetime.max): ''' Obtain source files for the corpus, relevant to the given timespan. @@ -310,26 +298,31 @@ def __init__(self): ''' self.fields = [] + class XMLCorpusDefinition(CorpusDefinition, XMLReader): ''' An XMLCorpus is any corpus that extracts its data from XML sources. ''' + class HTMLCorpusDefinition(CorpusDefinition, HTMLReader): ''' An HTMLCorpus is any corpus that extracts its data from HTML sources. ''' + class CSVCorpusDefinition(CorpusDefinition, CSVReader): ''' An CSVCorpus is any corpus that extracts its data from CSV sources. ''' + class XLSXCorpusDefinition(CorpusDefinition, XLSXReader): ''' An CSVCorpus is any corpus that extracts its data from an XLSX spreadsheet. ''' + class JSONCorpusDefinition(CorpusDefinition): ''' Corpus definition for json encoded data. @@ -339,7 +332,7 @@ def source2dicts(self, source, *nargs, **kwargs): self._reject_extractors(extract.XML, extract.CSV) field_dict = { - field.name: field.extractor.apply(source, *nargs, **kwargs) + field.name: field.extractor.apply(source, *nargs, **kwargs) for field in self.fields } @@ -347,6 +340,7 @@ def source2dicts(self, source, *nargs, **kwargs): # Fields ###################################################################### + class FieldDefinition(Field): ''' Definition for a single field in a corpus. 
@@ -420,17 +414,16 @@ def __init__(self, self.language = language self.hidden = not indexed or hidden - self.sortable = sortable if sortable != None else \ + self.sortable = sortable if sortable is not None else \ not hidden and indexed and \ mapping_type in ['integer', 'float', 'date'] - # Fields are searchable if they are not hidden and if they are mapped as 'text'. # Keyword fields without a filter are also searchable. - self.searchable = searchable if searchable != None else \ + self.searchable = searchable if searchable is not None else \ not hidden and indexed and \ ((mapping_type == 'text') or - (mapping_type == 'keyword' and self.search_filter == None)) + (mapping_type == 'keyword' and self.search_filter is None)) # Add back reference to field in filter self.downloadable = downloadable diff --git a/backend/addcorpus/python_corpora/save_corpus.py b/backend/addcorpus/python_corpora/save_corpus.py index 1358aaf74..cbdde2b16 100644 --- a/backend/addcorpus/python_corpora/save_corpus.py +++ b/backend/addcorpus/python_corpora/save_corpus.py @@ -153,7 +153,8 @@ def _save_corpus_documentation(corpus_definition: CorpusDefinition, configuratio if pages.exists(): pages.delete() -def _prepare_for_import(corpus): + +def _prepare_for_import(corpus: Corpus): corpus.has_python_definition = True corpus.active = False corpus.save() diff --git a/backend/conftest.py b/backend/conftest.py index e128a214f..e32787974 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -8,7 +8,6 @@ from addcorpus.json_corpora.import_json import import_json_corpus from ianalyzer.elasticsearch import elasticsearch -from addcorpus.python_corpora.load_corpus import load_corpus_definition from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora from es import es_index as index from django.conf import settings @@ -153,36 +152,36 @@ def test_index_cleanup(es_client: Elasticsearch): def _index_test_corpus(es_client: Elasticsearch, corpus_name: str): - corpus = 
load_corpus_definition(corpus_name) + corpus = Corpus.objects.get(name=corpus_name) - if not es_client.indices.exists(index=corpus.es_index): - index.create(es_client, corpus, False, True, False) - index.populate(es_client, corpus_name, corpus) + if not es_client.indices.exists(index=corpus.configuration.es_index): + index.create(es_client, corpus, clear=True) + index.populate(es_client, corpus) # ES is "near real time", so give it a second before we start searching the index sleep(2) @pytest.fixture() -def index_basic_mock_corpus(es_client: Elasticsearch, basic_mock_corpus: str, test_index_cleanup): +def index_basic_mock_corpus(db, es_client: Elasticsearch, basic_mock_corpus: str, test_index_cleanup): _index_test_corpus(es_client, basic_mock_corpus) @pytest.fixture() -def index_small_mock_corpus(es_client: Elasticsearch, small_mock_corpus: str, test_index_cleanup): +def index_small_mock_corpus(db, es_client: Elasticsearch, small_mock_corpus: str, test_index_cleanup): _index_test_corpus(es_client, small_mock_corpus) @pytest.fixture() -def index_large_mock_corpus(es_client: Elasticsearch, large_mock_corpus: str, test_index_cleanup): +def index_large_mock_corpus(db, es_client: Elasticsearch, large_mock_corpus: str, test_index_cleanup): _index_test_corpus(es_client, large_mock_corpus) @pytest.fixture() -def index_ml_mock_corpus(es_client: Elasticsearch, ml_mock_corpus: str, test_index_cleanup): +def index_ml_mock_corpus(db, es_client: Elasticsearch, ml_mock_corpus: str, test_index_cleanup): _index_test_corpus(es_client, ml_mock_corpus) @pytest.fixture() -def index_tag_mock_corpus(es_client: Elasticsearch, tag_mock_corpus: str, test_index_cleanup): +def index_tag_mock_corpus(db, es_client: Elasticsearch, tag_mock_corpus: str, test_index_cleanup): _index_test_corpus(es_client, tag_mock_corpus) @@ -201,6 +200,15 @@ def json_corpus_data(): @pytest.fixture(autouse=True) -def json_mock_corpus(db, json_corpus_data): +def json_mock_corpus(db, json_corpus_data, 
mock_corpus_dir): # add json mock corpora to the database at the start of each test - return import_json_corpus(json_corpus_data) + corpus = import_json_corpus(json_corpus_data) + corpus.configuration.data_directory = os.path.join( + mock_corpus_dir, 'basic', 'source_data' + ) + return corpus + + +@pytest.fixture() +def mock_corpus_dir(): + return os.path.join(settings.BASE_DIR, 'corpora_test') diff --git a/backend/corpora/jewishmigration/test_jewishmigration.py b/backend/corpora/jewishmigration/test_jewishmigration.py index e35d144c3..635081802 100644 --- a/backend/corpora/jewishmigration/test_jewishmigration.py +++ b/backend/corpora/jewishmigration/test_jewishmigration.py @@ -6,9 +6,12 @@ import requests from addcorpus.es_mappings import geo_mapping +from addcorpus.models import Corpus from addcorpus.python_corpora.load_corpus import load_corpus_definition +from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora from es import es_index + here = os.path.abspath(os.path.dirname(__file__)) class MockResponse(object): @@ -129,14 +132,19 @@ def mock_get(_dummy_path): } @pytest.fixture -def jm_corpus(settings): +def jm_corpus_settings(settings): settings.CORPORA = { 'jewishmigration': os.path.join(here, 'jewishmigration.py') } - settings.JMIG_DATA = 'https://example.com' + settings.JMIG_DATA = None settings.JMIG_INDEX = 'test-jewishmigration' - corpus_definition = load_corpus_definition('jewishmigration') - return corpus_definition + + +@pytest.fixture +def jm_corpus(jm_corpus_settings): + load_and_save_all_corpora() + corpus = Corpus.objects.get(name='jewishmigration') + return corpus @pytest.fixture @@ -152,17 +160,24 @@ def jm_client(es_client, jm_corpus): sleep(1) yield es_client # delete index when done - es_client.indices.delete(index=jm_corpus.es_index) + es_client.indices.delete(index=jm_corpus.configuration.es_index) + + +def test_jm_validation(db, jm_corpus): + assert jm_corpus + assert jm_corpus.configuration_obj + assert jm_corpus.active 
def test_geofield(jm_client, jm_corpus): - assert jm_client.indices.get(index=jm_corpus.es_index) + es_index = jm_corpus.configuration.es_index + assert jm_client.indices.get(index=es_index) field_mapping = jm_client.indices.get_field_mapping( - fields='coordinates', index=jm_corpus.es_index) - assert field_mapping[jm_corpus.es_index]['mappings']['coordinates']['mapping']['coordinates'] == geo_mapping() + fields='coordinates', index=es_index) + assert field_mapping[es_index]['mappings']['coordinates']['mapping']['coordinates'] == geo_mapping() geo_data = 'gibberish' try: - jm_client.create(index=jm_corpus.es_index, id=1, + jm_client.create(index=es_index, id=1, document={'coordinates': geo_data}) except Exception as e: assert type(e) == BadRequestError @@ -173,9 +188,9 @@ def test_geofield(jm_client, jm_corpus): 3.0 #latitude north/south ] } - jm_client.create(index=jm_corpus.es_index, id=1, + jm_client.create(index=es_index, id=1, document={'coordinates': geo_data}) - document = jm_client.get(index=jm_corpus.es_index, id=1) + document = jm_client.get(index=es_index, id=1) assert document['_source']['coordinates'] == geo_data query = { "geo_bounding_box": { @@ -193,15 +208,16 @@ def test_geofield(jm_client, jm_corpus): } # wait for the indexing operation to be finished sleep(1) - results = jm_client.search(index=jm_corpus.es_index, query=query) + results = jm_client.search(index=es_index, query=query) assert results['hits']['total']['value'] == 1 def test_data_from_request(jm_corpus, monkeypatch): monkeypatch.setattr(requests, "get", mock_get) - sources = jm_corpus.sources( - start=jm_corpus.min_date, end=jm_corpus.max_date) - documents = list(jm_corpus.documents(sources)) + corpus_def = load_corpus_definition(jm_corpus.name) + sources = corpus_def.sources( + start=corpus_def.min_date, end=corpus_def.max_date) + documents = list(corpus_def.documents(sources)) assert len(documents) == 3 reference_document = documents[0] for key in EXPECTED_DOCUMENT.keys(): diff 
--git a/backend/es/conftest.py b/backend/es/conftest.py index ac99c36a7..c9272b917 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -37,7 +37,7 @@ def es_alias_client(es_client, mock_corpus): Returns an elastic search client for the mock corpus. """ # add data from mock corpus - corpus = load_corpus_definition(mock_corpus) + corpus = Corpus.objects.get(name=mock_corpus) es_index.create(es_client, corpus, add=False, clear=True, prod=True) # create ianalyzer-times-1 index es_client.indices.create(index='times-test-2') es_client.indices.create(index='times-test-bla-3') diff --git a/backend/es/es_alias.py b/backend/es/es_alias.py index efcb1e641..2d3e353bf 100644 --- a/backend/es/es_alias.py +++ b/backend/es/es_alias.py @@ -1,20 +1,25 @@ #!/usr/bin/env python3 import re +from addcorpus.models import Corpus from ianalyzer.elasticsearch import elasticsearch import logging logger = logging.getLogger('indexing') -def alias(corpus_name, corpus_definition, clean=False): +def alias(corpus: Corpus, clean=False): ''' Script to create, update and remove aliases from ES ''' + corpus_config = corpus.configuration + corpus_name = corpus.name + index_name = corpus_config.es_index + index_alias = corpus_config.es_alias client = elasticsearch(corpus_name) - alias = corpus_definition.es_alias if corpus_definition.es_alias else corpus_definition.es_index - indices = client.indices.get(index='{}-*'.format(corpus_definition.es_index)) + alias = index_alias if index_alias else index_name + indices = client.indices.get(index='{}-*'.format(index_name)) highest_version = get_highest_version_number(indices, alias) actions = [] @@ -36,7 +41,7 @@ def alias(corpus_name, corpus_definition, clean=False): if is_highest_version and not is_aliased: logger.info('Adding alias `{}` for index `{}`'.format(alias, index_name)) actions.append( - {'add': {'index': index_name, 'alias': alias }}) + {'add': {'index': index_name, 'alias': alias}}) elif is_highest_version and is_aliased: 
logger.info('Alias `{}` already exists for `{}`, skipping alias creation'.format( alias, index_name)) diff --git a/backend/es/es_index.py b/backend/es/es_index.py index 75bbd211b..d7d6abbb9 100644 --- a/backend/es/es_index.py +++ b/backend/es/es_index.py @@ -5,23 +5,59 @@ ''' import sys +from typing import Dict, Optional +from elasticsearch import Elasticsearch import elasticsearch.helpers as es_helpers from elasticsearch.exceptions import RequestError from django.conf import settings +from addcorpus.es_settings import es_settings +from addcorpus.models import Corpus, CorpusConfiguration +from addcorpus.python_corpora.load_corpus import load_corpus_definition +from addcorpus.reader import make_reader from ianalyzer.elasticsearch import elasticsearch from .es_alias import alias, get_new_version_number +import datetime import logging logger = logging.getLogger('indexing') -def create(client, corpus_definition, add, clear, prod): +def _make_es_settings(corpus: Corpus) -> Dict: + if corpus.has_python_definition: + corpus_def = load_corpus_definition(corpus.name) + return corpus_def.es_settings + return es_settings( + languages=corpus.configuration.languages, + stemming_analysis=True, + stopword_analysis=True, + ) + + +def _make_es_mapping(corpus_configuration: CorpusConfiguration) -> Dict: + ''' + Create the ElasticSearch mapping for the fields of this corpus. May be + passed to the body of an ElasticSearch index creation request. + ''' + return { + 'properties': { + field.name: field.es_mapping + for field in corpus_configuration.fields.all() + if field.es_mapping and field.indexed + } + } + + +def create(client: Elasticsearch, corpus: Corpus, add: bool = False, clear: bool = False, prod: bool = False): ''' Initialise an ElasticSearch index. ''' + corpus_config = corpus.configuration + index_name = corpus_config.es_index + es_mapping = _make_es_mapping(corpus_config) + if add: # we add document to existing index - skip creation. 
return None @@ -29,64 +65,65 @@ def create(client, corpus_definition, add, clear, prod): if clear: logger.info('Attempting to clean old index...') client.indices.delete( - index=corpus_definition.es_index, ignore=[400, 404]) + index=index_name, ignore=[400, 404]) - settings = corpus_definition.es_settings + settings = _make_es_settings(corpus) if prod: logger.info('Using a versioned index name') - alias = corpus_definition.es_alias if corpus_definition.es_alias else corpus_definition.es_index - corpus_definition.es_index = "{}-{}".format( - corpus_definition.es_index, get_new_version_number(client, alias, corpus_definition.es_index)) - if client.indices.exists(index=corpus_definition.es_index): + alias = corpus_config.es_alias if corpus_config.es_alias else index_name + index_name = "{}-{}".format( + index_name, get_new_version_number(client, alias, index_name)) + if client.indices.exists(index=index_name): logger.error('Index `{}` already exists. Do you need to add an alias for it or perhaps delete it?'.format( - corpus_definition.es_index)) + index_name)) sys.exit(1) logger.info('Adding prod settings to index') settings['index'].update({ - 'number_of_replicas' : 0, + 'number_of_replicas': 0, 'number_of_shards': 5 }) logger.info('Attempting to create index `{}`...'.format( - corpus_definition.es_index)) + index_name)) try: client.indices.create( - index=corpus_definition.es_index, + index=index_name, settings=settings, - mappings=corpus_definition.es_mapping(), + mappings=es_mapping, ) except RequestError as e: - if not 'already_exists' in e.error: + if 'already_exists' not in e.error: # ignore that the index already exist, # raise any other errors. raise -def populate(client, corpus_name, corpus_definition, start=None, end=None): +def populate(client: Elasticsearch, corpus: Corpus, start=None, end=None): ''' Populate an ElasticSearch index from the corpus' source files. 
''' + corpus_config = corpus.configuration + corpus_name = corpus.name + index_name = corpus_config.es_index + reader = make_reader(corpus) logger.info('Attempting to populate index...') # Obtain source documents - files = corpus_definition.sources( - start or corpus_definition.min_date, - end or corpus_definition.max_date) - docs = corpus_definition.documents(files) - - if not type(corpus_definition.es_index)==str: - raise Exception('es_index is not a string') + files = reader.sources( + start=start or corpus_config.min_date, + end=end or corpus_config.max_date) + docs = reader.documents(files) # Each source document is decorated as an indexing operation, so that it # can be sent to ElasticSearch in bulk actions = ( { '_op_type': 'index', - '_index': corpus_definition.es_index, - '_id' : doc.get('id'), + '_index': index_name, + '_id': doc.get('id'), '_source': doc } for doc in docs ) @@ -103,44 +140,57 @@ def populate(client, corpus_name, corpus_definition, start=None, end=None): if not success: logger.error(f"FAILED INDEX: {info}") -def perform_indexing(corpus_name, corpus_definition, start, end, mappings_only, add, clear, prod, rollover): - logger.info('Started indexing `{}` from {} to {}...'.format( - corpus_definition.es_index, - start.strftime('%Y-%m-%d'), - end.strftime('%Y-%m-%d') + +def perform_indexing( + corpus: Corpus, + start: Optional[datetime.date] = None, + end: Optional[datetime.date] = None, + mappings_only: bool = False, + add: bool = False, + clear: bool = False, + prod: bool = False, + rollover: bool = False +): + corpus_config = corpus.configuration + corpus_name = corpus.name + index_name = corpus_config.es_index + + logger.info('Started indexing `{}` on index {}'.format( + corpus_name, + index_name )) if rollover and not prod: - logger.info('rollover flag is set but prod flag not set -- no effect') + logger.warning( + 'rollover flag is set but prod flag not set -- no effect') # Create and populate the ES index client = 
elasticsearch(corpus_name) - logger.info( - vars(client).get('_max_retries')) + logger.info('max_retries: {}'.format(vars(client).get('_max_retries'))) - logger.info( - vars(client).get('_retry_on_timeout') + logger.info('retry on timeout: {}'.format( + vars(client).get('_retry_on_timeout')) ) - create(client, corpus_definition, add, clear, prod) + create(client, corpus, add, clear, prod) client.cluster.health(wait_for_status='yellow') if mappings_only: - logger.info('Created index `{}` with mappings only.'.format(corpus_definition.es_index)) + logger.info('Created index `{}` with mappings only.'.format(index_name)) return - populate(client, corpus_name, corpus_definition, start=start, end=end) + populate(client, corpus, start=start, end=end) - logger.info('Finished indexing `{}`.'.format(corpus_definition.es_index)) + logger.info('Finished indexing `{}` to index `{}`.'.format( + corpus_name, index_name)) if prod: logger.info('Updating settings for index `{}`'.format( - corpus_definition.es_index)) + index_name)) client.indices.put_settings( settings={'number_of_replicas': 1}, - index=corpus_definition.es_index + index=index_name ) if rollover: logger.info('Adjusting alias for index `{}`'.format( - corpus_definition.es_index)) - alias(corpus_name, corpus_definition) # not deleting old index, so we can roll back - + index_name)) + alias(corpus) # not deleting old index, so we can roll back diff --git a/backend/es/tests/test_alias.py b/backend/es/tests/test_alias.py index 0d83acbd9..6d21f562b 100644 --- a/backend/es/tests/test_alias.py +++ b/backend/es/tests/test_alias.py @@ -1,38 +1,30 @@ -import pytest - +from addcorpus.models import Corpus from es.es_alias import alias, get_highest_version_number -from es.es_index import create -from addcorpus.python_corpora.load_corpus import load_corpus_definition -def test_alias(es_alias_client): - corpus_definition = load_corpus_definition('times') - assert corpus_definition.es_index == 'times-test' - alias('times', 
corpus_definition) # create an alias ianalyzer-times - res = es_alias_client.indices.get_alias(name=corpus_definition.es_index) +def test_alias(db, es_alias_client): + corpus = Corpus.objects.get(name='times') + assert corpus.configuration.es_index == 'times-test' + alias(corpus) # create an alias ianalyzer-times + res = es_alias_client.indices.get_alias(name=corpus.configuration.es_index) assert res.get('times-test-2') is not None + def test_alias_with_clean(es_alias_client): - corpus_definition = load_corpus_definition('times') - indices = es_alias_client.indices.get(index='{}-*'.format(corpus_definition.es_index)) + corpus = Corpus.objects.get(name='times') + indices = es_alias_client.indices.get( + index='{}-*'.format(corpus.configuration.es_index)) assert 'times-test-1' in list(indices.keys()) - alias('times', corpus_definition, True) - indices = es_alias_client.indices.get(index='{}-*'.format(corpus_definition.es_index)) + alias(corpus, True) + indices = es_alias_client.indices.get( + index='{}-*'.format(corpus.configuration.es_index)) assert 'times-test-1' not in list(indices.keys()) + def test_highest_version_number(es_alias_client): - corpus_definition = load_corpus_definition('times') - indices = es_alias_client.indices.get(index='{}-*'.format(corpus_definition.es_index)) + corpus = Corpus.objects.get(name='times') + indices = es_alias_client.indices.get( + index='{}-*'.format(corpus.configuration.es_index)) current_index = 'times-test' num = get_highest_version_number(indices, current_index) assert num == 2 - - - - - - - - - - diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 6f69f3611..31f17ab4d 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -2,10 +2,11 @@ from datetime import datetime from time import sleep +from addcorpus.models import Corpus from es.es_index import perform_indexing -start = datetime.strptime('1970-01-01','%Y-%m-%d') -end = 
datetime.strptime('1970-12-31','%Y-%m-%d') +START = datetime.strptime('1970-01-01', '%Y-%m-%d') +END = datetime.strptime('1970-12-31', '%Y-%m-%d') def mock_client(es_index_client): @@ -14,42 +15,60 @@ def mock_client(es_index_client): @pytest.mark.parametrize("prod, name, shards", [(True, "times-test-1", '5'), (False, "times-test", '1')]) def test_prod_flag(mock_corpus, es_index_client, corpus_definition, prod, name, shards): + corpus = Corpus.objects.get(name=mock_corpus) perform_indexing( - mock_corpus, corpus_definition, start, end, + corpus, START, END, mappings_only=True, add=False, clear=False, prod=prod, rollover=False) - + assert es_index_client.indices.exists(index=name) - assert es_index_client.indices.get_settings(index=name).get(name)['settings']['index']['number_of_shards'] == shards + assert es_index_client.indices.get_settings(index=name).get( + name)['settings']['index']['number_of_shards'] == shards + @pytest.mark.parametrize("mappings_only, expected", [(False, 2), (True, 0)]) def test_mappings_only_flag(mock_corpus, es_index_client, corpus_definition, mappings_only, expected): + corpus = Corpus.objects.get(name=mock_corpus) perform_indexing( - mock_corpus, corpus_definition, start, end, + corpus, START, END, mappings_only=mappings_only, add=False, clear=False, prod=False, rollover=False) sleep(1) res = es_index_client.count(index='times-test*') assert res.get('count') == expected -def test_add_clear(mock_corpus, es_index_client, corpus_definition): + +def test_add_clear(db, mock_corpus, es_index_client): + corpus = Corpus.objects.get(name=mock_corpus) perform_indexing( - mock_corpus, corpus_definition, start, end, + corpus, START, END, mappings_only=True, add=False, clear=False, prod=False, rollover=False ) res = es_index_client.count(index='times-test*') assert res.get('count') == 0 perform_indexing( - mock_corpus, corpus_definition, start, end, + corpus, START, END, mappings_only=False, add=True, clear=False, prod=False, rollover=False ) 
sleep(1) res = es_index_client.count(index='times-test*') assert res.get('count') == 2 perform_indexing( - mock_corpus, corpus_definition, start, end, + corpus, START, END, mappings_only=True, add=False, clear=True, prod=False, rollover=False ) res = es_index_client.count(index='times-test*') assert res.get('count') == 0 + def test_mismatch_corpus_index_names(mock_corpus, corpus_definition, es_index_client): assert corpus_definition.es_index != mock_corpus + + +def test_db_only_corpus(json_mock_corpus, es_client, test_index_cleanup): + json_mock_corpus.name = 'example2' + json_mock_corpus.configuration.es_index = 'test-example2' + perform_indexing( + corpus=json_mock_corpus, + ) + sleep(1) + res = es_client.count(index=json_mock_corpus.configuration.es_index) + assert res.get('count') == 10 From 556cf394e2479edcd46ee79d26635f5cc67e12ee Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Mon, 6 May 2024 16:32:08 +0200 Subject: [PATCH 40/94] Don't change name in json corpus test --- backend/es/tests/test_es_index.py | 2 -- frontend/scroll-position-restoration-local | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) create mode 160000 frontend/scroll-position-restoration-local diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 31f17ab4d..f877e7fbc 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -64,8 +64,6 @@ def test_mismatch_corpus_index_names(mock_corpus, corpus_definition, es_index_cl def test_db_only_corpus(json_mock_corpus, es_client, test_index_cleanup): - json_mock_corpus.name = 'example2' - json_mock_corpus.configuration.es_index = 'test-example2' perform_indexing( corpus=json_mock_corpus, ) diff --git a/frontend/scroll-position-restoration-local b/frontend/scroll-position-restoration-local new file mode 160000 index 000000000..39fcb7891 --- /dev/null +++ b/frontend/scroll-position-restoration-local @@ -0,0 +1 @@ +Subproject commit 39fcb7891196f42ae4a3a99054ea3e76e532ad10 
From 1d3f83491cb418c5077ce835c0e714d0f8fb3db4 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Mon, 6 May 2024 16:52:06 +0200 Subject: [PATCH 41/94] Use corpus object in index command --- backend/addcorpus/python_corpora/save_corpus.py | 3 ++- backend/es/management/commands/index.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/addcorpus/python_corpora/save_corpus.py b/backend/addcorpus/python_corpora/save_corpus.py index cbdde2b16..214778abc 100644 --- a/backend/addcorpus/python_corpora/save_corpus.py +++ b/backend/addcorpus/python_corpora/save_corpus.py @@ -192,8 +192,9 @@ def _save_or_skip_corpus(corpus_name, corpus_definition, verbose=False, stdout=s _save_corpus_configuration(corpus, corpus_definition) _activate_if_ready(corpus) if verbose: - print(f'Saved corpus: {corpus_name}', file=stdout) + print(f'Saved corpus: {corpus_name}', file=stdout) except Exception as e: + transaction.rollback() print(f'Failed saving corpus: {corpus_name}', file=stderr) print(f'Error: {e}', file=stderr) diff --git a/backend/es/management/commands/index.py b/backend/es/management/commands/index.py index f3adebc96..36a965638 100644 --- a/backend/es/management/commands/index.py +++ b/backend/es/management/commands/index.py @@ -72,7 +72,7 @@ def add_arguments(self, parser): (Only applicable in combination with --prod)''' ) - def handle(self, corpus, start = None, end = None, add=False, delete=False, update=False, mappings_only=False, prod=False, rollover=False, **options): + def handle(self, corpus, start=None, end=None, add=False, delete=False, update=False, mappings_only=False, prod=False, rollover=False, **options): corpus_object = self._corpus_object(corpus) self._validate(corpus_object) @@ -117,7 +117,8 @@ def handle(self, corpus, start = None, end = None, add=False, delete=False, upda logging.critical(e) raise else: - perform_indexing(corpus, corpus_definition, start_index, end_index, mappings_only, add, delete, prod, rollover) + 
perform_indexing(corpus_object, start_index, end_index, + mappings_only, add, delete, prod, rollover) def _corpus_object(self, corpus_name): load_all_corpus_definitions() From 8f6e2cc31372a24e26dfe7911b145406df614532 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Mon, 6 May 2024 17:05:30 +0200 Subject: [PATCH 42/94] Use corpus object in alias command --- backend/es/management/commands/alias.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/es/management/commands/alias.py b/backend/es/management/commands/alias.py index 99a3cbabf..df6659a7b 100644 --- a/backend/es/management/commands/alias.py +++ b/backend/es/management/commands/alias.py @@ -1,5 +1,6 @@ from django.core.management import BaseCommand +from addcorpus.models import Corpus from addcorpus.python_corpora.load_corpus import load_corpus_definition from es.es_alias import alias @@ -25,5 +26,5 @@ def add_arguments(self, parser): ) def handle(self, corpus, clean=False, **options): - corpus_definition = load_corpus_definition(corpus) - alias(corpus, corpus_definition, clean) + corpus_obj = Corpus.objects.get(name=corpus) + alias(corpus_obj, clean) From ceb729d402eb6a4779d0cd72d3cb3d085fe1034f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 6 May 2024 17:25:30 +0200 Subject: [PATCH 43/94] fix template reference in wordcloud close #1559 --- .../src/app/visualization/wordcloud/wordcloud.component.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.html b/frontend/src/app/visualization/wordcloud/wordcloud.component.html index 7ad2c2112..448f54ff2 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.html +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.html @@ -4,7 +4,7 @@
- +
From 880a323c08c63f59c94a17b70c6702a28cee61e4 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 6 May 2024 17:27:24 +0200 Subject: [PATCH 44/94] fix wordcloud for dynamic language fields close #1558 --- backend/visualization/wordcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 397839bda..709a6092b 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -8,7 +8,7 @@ def field_stopwords(corpus_name, field_name): corpus = Corpus.objects.get(name=corpus_name) field = corpus.configuration.fields.get(name=field_name) - if field.language and field.language is not 'dynamic': + if field.language and field.language != 'dynamic': return get_nltk_stopwords(field.language) else: return [] From 55a1e97d62b44991052381c7e632a2bcf79a89a4 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 7 May 2024 14:16:17 +0200 Subject: [PATCH 45/94] Fix indexing tests --- backend/addcorpus/python_corpora/save_corpus.py | 1 - .../corpora/jewishmigration/jewishmigration.py | 16 +++++++++++----- .../jewishmigration/test_jewishmigration.py | 1 + backend/es/tests/test_es_index.py | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/backend/addcorpus/python_corpora/save_corpus.py b/backend/addcorpus/python_corpora/save_corpus.py index 214778abc..13f3ff20b 100644 --- a/backend/addcorpus/python_corpora/save_corpus.py +++ b/backend/addcorpus/python_corpora/save_corpus.py @@ -194,7 +194,6 @@ def _save_or_skip_corpus(corpus_name, corpus_definition, verbose=False, stdout=s if verbose: print(f'Saved corpus: {corpus_name}', file=stdout) except Exception as e: - transaction.rollback() print(f'Failed saving corpus: {corpus_name}', file=stderr) print(f'Error: {e}', file=stderr) diff --git a/backend/corpora/jewishmigration/jewishmigration.py b/backend/corpora/jewishmigration/jewishmigration.py index e413728d2..24fa4486e 100644 --- 
a/backend/corpora/jewishmigration/jewishmigration.py +++ b/backend/corpora/jewishmigration/jewishmigration.py @@ -1,5 +1,6 @@ from datetime import datetime import json +import logging from django.conf import settings import langcodes @@ -46,8 +47,10 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition): description = "Inscriptions and book entries documenting Jewish settlements in the Mediterranean" min_date = datetime(year=1, month=1, day=1) max_date = datetime(year=1800, month=12, day=31) - data_directory = getattr(settings, 'JMIG_DATA', - 'localhost:8100/api/records/') + + data_directory = getattr(settings, 'JMIG_DATA') + data_url = getattr(settings, 'JMIG_DATA_URL', + 'localhost:8100/api/records/') es_index = getattr(settings, 'JMIG_INDEX', 'jewishmigration') image = 'jewish_inscriptions.jpg' @@ -56,12 +59,15 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition): category = 'inscription' def sources(self, start, end): - if self.data_directory.startswith('http'): - response = requests.get(self.data_directory) + if self.data_url: + response = requests.get(self.data_url) list_of_sources = response.json() - else: + elif self.data_directory: with open(self.data_directory, 'r') as f: list_of_sources = json.load(f) + else: + logging.getLogger('indexing').warning( + 'No data directory or URL provided.') for source in list_of_sources: yield source diff --git a/backend/corpora/jewishmigration/test_jewishmigration.py b/backend/corpora/jewishmigration/test_jewishmigration.py index 635081802..cec55f18a 100644 --- a/backend/corpora/jewishmigration/test_jewishmigration.py +++ b/backend/corpora/jewishmigration/test_jewishmigration.py @@ -137,6 +137,7 @@ def jm_corpus_settings(settings): 'jewishmigration': os.path.join(here, 'jewishmigration.py') } settings.JMIG_DATA = None + settings.JMIG_DATA_URL = 'http://www.example.com' settings.JMIG_INDEX = 'test-jewishmigration' diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 
f877e7fbc..970cd91a3 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -67,6 +67,6 @@ def test_db_only_corpus(json_mock_corpus, es_client, test_index_cleanup): perform_indexing( corpus=json_mock_corpus, ) - sleep(1) + sleep(2) res = es_client.count(index=json_mock_corpus.configuration.es_index) assert res.get('count') == 10 From 9cebf0fc5cd5d7933599789b04520632c7769d55 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 7 May 2024 14:32:05 +0200 Subject: [PATCH 46/94] make CorpusEditSerializer --- backend/addcorpus/serializers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/backend/addcorpus/serializers.py b/backend/addcorpus/serializers.py index f2c92626c..7539973f4 100644 --- a/backend/addcorpus/serializers.py +++ b/backend/addcorpus/serializers.py @@ -1,8 +1,13 @@ from rest_framework import serializers +from typing import Dict + from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage from addcorpus.constants import CATEGORIES from langcodes import Language, standardize_tag from addcorpus.documentation import render_documentation_context +from addcorpus.json_corpora.export_json import export_json_corpus +from addcorpus.json_corpora.import_json import import_json_corpus + class NonEmptyJSONField(serializers.JSONField): ''' @@ -121,3 +126,14 @@ class CorpusDocumentationPageSerializer(serializers.ModelSerializer): class Meta: model = CorpusDocumentationPage fields = ['corpus_configuration', 'type', 'content'] + + +class CorpusEditSerializer(serializers.ModelSerializer): + class Meta: + model = Corpus + + def to_representation(self, instance) -> Dict: + return export_json_corpus(instance) + + def to_internal_value(self, data): + return import_json_corpus(data) From 3b7e59112a7550bd73afa2ce4217399efddc81aa Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 7 May 2024 15:17:38 +0200 Subject: [PATCH 47/94] update import json to return validated data dict --- 
backend/addcorpus/json_corpora/import_json.py | 282 +++++++++--------- .../json_corpora/tests/test_import.py | 28 +- 2 files changed, 155 insertions(+), 155 deletions(-) diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index 9b6a68747..a85eb5aa2 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -1,4 +1,4 @@ -from typing import Dict, Iterable, Optional +from typing import List, Dict, Iterable, Optional from datetime import datetime @@ -10,22 +10,13 @@ from django.conf import settings from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT -def import_json_corpus(data: Dict) -> Corpus: +def import_json_corpus(data: Dict) -> Dict: name = get_path(data, 'name') - corpus, _created = Corpus.objects.get_or_create(name=name) - - # create a clean CorpusConfiguration object, but use the existing PK if possible - pk = corpus.configuration_obj.pk if corpus.configuration_obj else None - configuration = CorpusConfiguration(pk=pk, corpus=corpus) - configuration = _parse_configuration(data, configuration) - configuration.save() - configuration.full_clean() - - _import_fields(data, configuration) - - return corpus - + return { + 'name': name, + 'configuration': _parse_configuration(data) + } def create_index_name(corpus_name: str) -> str: prefix = settings.SERVERS['default'].get('index_prefix', None) @@ -34,71 +25,57 @@ def create_index_name(corpus_name: str) -> str: return corpus_name -def _parse_configuration(data: Dict, configuration: CorpusConfiguration) -> CorpusConfiguration: - configuration.title = get_path(data, 'meta', 'title') - configuration.description = get_path(data, 'meta', 'description') - configuration.category = get_path(data, 'meta', 'category') - configuration.es_index = create_index_name(get_path(data, 'name')) - configuration.languages = get_path(data, 'meta', 'languages') - configuration.min_date = _parse_date( - 
get_path(data, 'meta', 'date_range', 'min')) - configuration.max_date = _parse_date( - get_path(data, 'meta', 'date_range', 'max')) - configuration.default_sort = get_path( - data, 'options', 'default_sort') or {} - configuration.language_field = get_path( - data, 'options', 'language_field') or '' - configuration.document_context = get_path( - data, 'options', 'document_context') or {} - configuration.source_data_delimiter = get_path( - data, 'source_data', 'options', 'delimiter') or DEFAULT_CSV_DELIMITER - return configuration +def _parse_configuration(data: Dict) -> Dict: + return { + 'title': get_path(data, 'meta', 'title'), + 'description': get_path(data, 'meta', 'description'), + 'category': get_path(data, 'meta', 'category'), + 'es_index': create_index_name(get_path(data, 'name')), + 'languages': get_path(data, 'meta', 'languages'), + 'min_date': _parse_date( + get_path(data, 'meta', 'date_range', 'min')), + 'max_date': _parse_date( + get_path(data, 'meta', 'date_range', 'max')), + 'default_sort': get_path( + data, 'options', 'default_sort') or {}, + 'language_field': get_path( + data, 'options', 'language_field') or '', + 'document_context': get_path( + data, 'options', 'document_context') or {}, + 'source_data_delimiter': get_path( + data, 'source_data', 'options', 'delimiter') or DEFAULT_CSV_DELIMITER, + 'fields': _import_fields(data), + } def _parse_date(date: str): return datetime.strptime(date, DATE_FORMAT).date() -def _import_fields(data: Dict, configuration: CorpusConfiguration) -> None: +def _import_fields(data: Dict) -> List[Dict]: fields_data = get_path(data, 'fields') - for field_data in fields_data: - field = _parse_field(field_data, configuration) - field.save() - field.full_clean() + parsed = [_parse_field(field) for field in fields_data] - for field in configuration.fields.exclude(name__in=(f['name'] for f in fields_data)): - field.delete() + # TODO: replace this!!!! 
+ # for field in configuration.fields.exclude(name__in=(f['name'] for f in fields_data)): + # field.delete() - _include_ngram_visualisation(configuration.fields.all()) + _include_ngram_visualisation(parsed) + return parsed -def _field_pk(name: str, configuration: CorpusConfiguration): - try: - return Field.objects.get(corpus_configuration=configuration, name=name).pk - except Field.DoesNotExist: - return None - - -def _parse_field(field_data: Dict, configuration: Optional[CorpusConfiguration] = None) -> Field: - name = get_path(field_data, 'name') - display_name = get_path(field_data, 'display_name') - description = get_path(field_data, 'description') +def _parse_field(field_data: Dict) -> Dict: results_overview = get_path(field_data, 'options', 'preview') - hidden = get_path(field_data, 'options', 'hidden') - extract_column = get_path(field_data, 'extract', 'column') - - field = Field( - pk=_field_pk(name, configuration) if configuration else None, - corpus_configuration=configuration, - name=name, - display_name=display_name, - description=description, - results_overview=results_overview, - hidden=hidden, - csv_core=results_overview, - extract_column=extract_column, - ) + parsed = { + 'name': get_path(field_data, 'name'), + 'display_name': get_path(field_data, 'display_name'), + 'description': get_path(field_data, 'description'), + 'results_overview': results_overview, + 'hidden': get_path(field_data, 'options', 'hidden'), + 'extract_column': get_path(field_data, 'extract', 'column'), + 'csv_core': results_overview, + } field_type = get_path(field_data, 'type') parsers = { @@ -111,171 +88,185 @@ def _parse_field(field_data: Dict, configuration: Optional[CorpusConfiguration] 'boolean': _parse_boolean_field, 'geo_point': _parse_geo_field, } - field = parsers[field_type](field, field_data) - - return field - + type_specific_data = parsers[field_type](field_data) + parsed.update(type_specific_data) + return parsed -def _parse_text_content_field(field: Field, 
field_data: Dict) -> Field: +def _parse_text_content_field(field_data: Dict) -> Field: language = _parse_language(field_data) has_single_language = language and language != 'dynamic' - field.es_mapping = es_mappings.main_content_mapping( - token_counts=True, - stopword_analysis=has_single_language, - stemming_analysis=has_single_language, - language=language if has_single_language else None, - ) - field.language = language - field.display_type = 'text_content' - field.search_filter = {} - field.searchable = True - field.search_field_core = True + parsed = { + 'es_mapping': es_mappings.main_content_mapping( + token_counts=True, + stopword_analysis=has_single_language, + stemming_analysis=has_single_language, + language=language if has_single_language else None, + ), + 'language': language, + 'display_type': 'text_content', + 'search_filter': {}, + 'searchable': True, + 'search_field_core': True, + } visualize = get_path(field_data, 'options', 'visualize') if visualize: - field.visualizations = [ + parsed['visualizations'] = [ VisualizationType.WORDCLOUD.value ] - return field + return parsed -def _parse_text_metadata_field(field: Field, field_data: Dict) -> Field: +def _parse_text_metadata_field(field_data: Dict) -> Dict: searchable = get_path(field_data, 'options', 'search') filter_setting = get_path(field_data, 'options', 'filter') filterable = filter_setting != 'none' sortable = get_path(field_data, 'options', 'sort') visualize = get_path(field_data, 'options', 'visualize') - field.language = _parse_language(field_data) + parsed = { + 'language': _parse_language(field_data) + } if searchable and not (sortable or filterable): - field.es_mapping = es_mappings.text_mapping() - field.display_type = 'text' - field.search_filter = {} - field.searchable = True + parsed['es_mapping'] = es_mappings.text_mapping() + parsed['display_type'] = 'text' + parsed['search_filter'] = {} + parsed['searchable'] = True if visualize: - field.visualizations = [ + 
parsed['visualizations'] = [ VisualizationType.WORDCLOUD.value ] else: - field.es_mapping = es_mappings.keyword_mapping( + parsed['es_mapping'] = es_mappings.keyword_mapping( enable_full_text_search=searchable ) - field.display_type = 'keyword' + parsed['display_type'] = 'keyword' if filter_setting == 'show': - field.search_filter = { + parsed['search_filter'] = { 'name': 'MultipleChoiceFilter', - 'description': f'Select results based on {field.display_name}', + 'description': f'Select results based on {field_data["display_name"]}', } else: - field.search_filter = {} - field.searchable = searchable - field.sortable = sortable + parsed['search_filter'] = {} + parsed['searchable'] = searchable + parsed['sortable'] = sortable if visualize: - field.visualizations = [ + parsed['visualizations'] = [ VisualizationType.RESULTS_COUNT.value, VisualizationType.TERM_FREQUENCY.value, ] - return field + return parsed def _parse_language(field_data: Dict) -> str: return get_path(field_data, 'language') or '' -def _parse_url_field(field: Field, field_data: Dict) -> Field: - field.es_mapping = es_mappings.keyword_mapping() - field.display_type = 'url' - field.search_filter = {} - return field - +def _parse_url_field(_field_data: Dict) -> Dict: + parsed = { + 'es_mapping': es_mappings.keyword_mapping(), + 'display_type': 'url', + 'search_filter': {}, + } + return parsed -def _parse_numeric_field(field: Field, field_data: Dict) -> Field: - field.display_type = get_path(field_data, 'type') +def _parse_numeric_field(field_data: Dict) -> Dict: + parsed = { + 'display_type': get_path(field_data, 'type') + } - if field.display_type == 'integer': - field.es_mapping = es_mappings.int_mapping() + if parsed['display_type'] == 'integer': + parsed['es_mapping'] = es_mappings.int_mapping() else: - field.es_mapping = es_mappings.float_mapping() + parsed['es_mapping'] = es_mappings.float_mapping() - field.sortable = get_path(field_data, 'options', 'sort') + parsed['sortable'] = 
get_path(field_data, 'options', 'sort') filter_setting = get_path(field_data, 'options', 'filter') if filter_setting == 'show': - field.search_filter = { + parsed['search_filter'] = { 'name': 'RangeFilter', - 'description': f'Select results based on {field.display_name}', + 'description': f'Select results based on {field_data["display_name"]}', } else: - field.search_filter = {} + parsed['search_filter'] = {} visualize = get_path(field_data, 'options', 'visualize') if visualize: - field.visualizations = [ + parsed['visualizations'] = [ VisualizationType.RESULTS_COUNT.value, VisualizationType.TERM_FREQUENCY.value ] - field.visualization_sort = 'key' - return field + parsed['visualization_sort'] = 'key' + return parsed -def _parse_date_field(field: Field, field_data: Dict) -> Field: - field.display_type = 'date' - field.es_mapping = es_mappings.date_mapping() - field.sortable = get_path(field_data, 'options', 'sort') +def _parse_date_field(field_data: Dict) -> Dict: filter_setting = get_path(field_data, 'options', 'filter') + parsed = { + 'display_type': 'date', + 'es_mapping': es_mappings.date_mapping(), + 'sortable': get_path(field_data, 'options', 'sort'), + + } + if filter_setting == 'show': - field.search_filter = { + parsed['search_filter'] = { 'name': 'DateFilter', - 'description': f'Select results based on {field.display_name}', + 'description': f'Select results based on {field_data["display_name"]}', } else: - field.search_filter = {} + parsed['search_filter'] = {} visualize = get_path(field_data, 'options', 'visualize') if visualize: - field.visualizations = [ + parsed['visualizations'] = [ VisualizationType.RESULTS_COUNT.value, VisualizationType.TERM_FREQUENCY.value ] - return field + return parsed -def _parse_boolean_field(field: Field, field_data: Dict) -> Field: - field.display_type = 'boolean' - field.es_mapping = es_mappings.bool_mapping() +def _parse_boolean_field(field_data: Dict) -> Dict: + parsed = { + 'display_type': 'boolean', + 'es_mapping': 
es_mappings.bool_mapping(), + } + filter_setting = get_path(field_data, 'options', 'filter') if filter_setting == 'show': - field.search_filter = { + parsed['search_filter'] = { 'name': 'BooleanFilter', - 'description': f'Select results based on {field.display_name}', + 'description': f'Select results based on {field_data["display_name"]}', } else: - field.search_filter = {} + parsed['search_filter'] = {} visualize = get_path(field_data, 'options', 'visualize') if visualize: - field.visualizations = [ + parsed['visualizations'] = [ VisualizationType.RESULTS_COUNT.value, VisualizationType.TERM_FREQUENCY.value ] - return field + return parsed -def _parse_geo_field(field: Field, field_data: Dict) -> Field: - field.display_type = 'geo_point' - field.es_mapping = es_mappings.geo_mapping() - field.search_filter = {} - return field +def _parse_geo_field(field_data: Dict) -> Dict: + return { + 'display_type': 'geo_point', + 'es_mapping': es_mappings.geo_mapping(), + 'search_filter': {}, + } -def _include_ngram_visualisation(fields: Iterable[Field]): +def _include_ngram_visualisation(fields: Iterable[Dict]) -> None: ''' Check if the ngram visualisation can be included and add it if possible. @@ -283,8 +274,7 @@ def _include_ngram_visualisation(fields: Iterable[Field]): field is present. 
''' - if _any_date_fields(fields): + if any(get_path(field, 'es_mapping', 'type') == 'date' for field in fields): for field in fields: - if field.display_type == 'text_content': - field.visualizations.append(VisualizationType.NGRAM.value) - field.save() + if field['display_type'] == 'text_content': + field['visualizations'].append(VisualizationType.NGRAM.value) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 64ddeaf8b..835d805cc 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -1,9 +1,12 @@ from datetime import date from addcorpus.json_corpora.import_json import import_json_corpus, _parse_field - +from addcorpus.models import Corpus, Field def test_import(db, json_corpus_data): - corpus = import_json_corpus(json_corpus_data) + data = import_json_corpus(json_corpus_data) + corpus = Corpus(**data) + corpus.save() + corpus.full_clean() assert corpus.name == 'example' assert corpus.ready_to_index() @@ -31,7 +34,8 @@ def test_import(db, json_corpus_data): def test_parse_content_field(content_field_json): - field = _parse_field(content_field_json) + data = _parse_field(content_field_json) + field = Field(**data) assert field.name == 'content' assert field.display_name == 'Content' assert field.display_type == 'text_content' @@ -52,7 +56,8 @@ def test_parse_content_field(content_field_json): def test_parse_keyword_field(keyword_field_json): - field = _parse_field(keyword_field_json) + data = _parse_field(keyword_field_json) + field = Field(**data) assert field.name == 'author' assert field.display_type == 'keyword' assert field.search_filter['name'] == 'MultipleChoiceFilter' @@ -67,7 +72,8 @@ def test_parse_keyword_field(keyword_field_json): def test_parse_int_field(int_field_json): - field = _parse_field(int_field_json) + data = _parse_field(int_field_json) + field = Field(**data) assert field.name == 'year' assert 
field.display_type == 'integer' assert field.search_filter['name'] == 'RangeFilter' @@ -83,7 +89,8 @@ def test_parse_int_field(int_field_json): def test_parse_float_field(float_field_json): - field = _parse_field(float_field_json) + data = _parse_field(float_field_json) + field = Field(**data) assert field.name == 'ocr_confidence' assert field.display_type == 'float' assert field.search_filter == {} @@ -99,7 +106,8 @@ def test_parse_float_field(float_field_json): def test_parse_date_field(date_field_json): - field = _parse_field(date_field_json) + data = _parse_field(date_field_json) + field = Field(**data) assert field.name == 'date' assert field.display_type == 'date' assert field.search_filter['name'] == 'DateFilter' @@ -114,7 +122,8 @@ def test_parse_date_field(date_field_json): def test_parse_boolean_field(boolean_field_json): - field = _parse_field(boolean_field_json) + data = _parse_field(boolean_field_json) + field = Field(**data) assert field.name == 'author_known' assert field.display_type == 'boolean' assert field.search_filter['name'] == 'BooleanFilter' @@ -129,7 +138,8 @@ def test_parse_boolean_field(boolean_field_json): def test_parse_geo_field(geo_field_json): - field = _parse_field(geo_field_json) + data = _parse_field(geo_field_json) + field = Field(**data) assert field.name == 'location' assert field.display_type == 'geo_point' assert field.search_filter == {} From 10dfb9b2093d1c9b2e0dd21df5f92899e7bc55fd Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 7 May 2024 17:08:00 +0200 Subject: [PATCH 48/94] create + update methods on CorpusEditSerializer --- .../json_corpora/tests/test_import.py | 23 ++++++++--- backend/addcorpus/serializers.py | 38 ++++++++++++++++++- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 835d805cc..d87dfe887 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ 
b/backend/addcorpus/json_corpora/tests/test_import.py @@ -1,12 +1,13 @@ from datetime import date -from addcorpus.json_corpora.import_json import import_json_corpus, _parse_field -from addcorpus.models import Corpus, Field +from addcorpus.json_corpora.import_json import _parse_field +from addcorpus.models import Field +from addcorpus.serializers import CorpusEditSerializer + def test_import(db, json_corpus_data): - data = import_json_corpus(json_corpus_data) - corpus = Corpus(**data) - corpus.save() - corpus.full_clean() + serializer = CorpusEditSerializer(data=json_corpus_data) + assert serializer.is_valid() + corpus = serializer.create(serializer.validated_data) assert corpus.name == 'example' assert corpus.ready_to_index() @@ -33,6 +34,16 @@ def test_import(db, json_corpus_data): assert line_field.display_type == 'text_content' +def test_serializer_representation(db, json_corpus_data): + serializer = CorpusEditSerializer(data=json_corpus_data) + assert serializer.is_valid() + corpus = serializer.create(serializer.validated_data) + + serialized = serializer.to_representation(corpus) + + assert json_corpus_data == serialized + + def test_parse_content_field(content_field_json): data = _parse_field(content_field_json) field = Field(**data) diff --git a/backend/addcorpus/serializers.py b/backend/addcorpus/serializers.py index 7539973f4..67296f8f9 100644 --- a/backend/addcorpus/serializers.py +++ b/backend/addcorpus/serializers.py @@ -131,9 +131,45 @@ class Meta: class CorpusEditSerializer(serializers.ModelSerializer): class Meta: model = Corpus + fields = '__all__' def to_representation(self, instance) -> Dict: return export_json_corpus(instance) - def to_internal_value(self, data): + def to_internal_value(self, data) -> Dict: return import_json_corpus(data) + + def create(self, validated_data: Dict): + configuration_data = validated_data.pop('configuration') + fields_data = configuration_data.pop('fields') + + corpus = Corpus.objects.create(**validated_data) + 
configuration = CorpusConfiguration.objects.create(corpus=corpus, **configuration_data) + for field_data in fields_data: + Field.objects.create(corpus_configuration=configuration, **field_data) + + return corpus + + def update(self, instance: Corpus, validated_data: Dict): + configuration_data = validated_data.pop('configuration') + fields_data = configuration_data.pop('fields') + + corpus = Corpus(pk=instance.pk, **validated_data) + corpus.save() + + configuration, _ = CorpusConfiguration.objects.get_or_create(corpus=corpus) + for attr in validated_data: + setattr(configuration, attr, validated_data[attr]) + configuration.save() + + for field_data in fields_data: + field, _ = Field.objects.get_or_create( + corpus_configuration=configuration, name=field_data['name'] + ) + for attr in field_data: + setattr(field, attr, field_data[attr]) + field.save() + + configuration.fields.exclude(name__in=(f['name'] for f in fields_data)).delete() + + return corpus From 992ed405bb586fe8e62456fc281b64c9ea5f37b4 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 May 2024 12:20:24 +0200 Subject: [PATCH 49/94] update export_json tests --- backend/addcorpus/json_corpora/tests/test_export.py | 5 +++-- backend/conftest.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/backend/addcorpus/json_corpora/tests/test_export.py b/backend/addcorpus/json_corpora/tests/test_export.py index 5e4640d6a..b1e046c9d 100644 --- a/backend/addcorpus/json_corpora/tests/test_export.py +++ b/backend/addcorpus/json_corpora/tests/test_export.py @@ -1,5 +1,5 @@ from addcorpus.json_corpora.export_json import export_json_corpus, export_json_field -from addcorpus.models import Corpus +from addcorpus.models import Corpus, Field from addcorpus.json_corpora.import_json import _parse_field def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data): @@ -8,5 +8,6 @@ def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data): def test_field_export(any_field_json): 
imported = _parse_field(any_field_json) - exported = export_json_field(imported) + field = Field(**imported) + exported = export_json_field(field) assert any_field_json == exported diff --git a/backend/conftest.py b/backend/conftest.py index e128a214f..602ce741b 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -14,7 +14,7 @@ from django.conf import settings from django.contrib.auth.models import Group from addcorpus.models import Corpus - +from addcorpus.serializers import CorpusEditSerializer @pytest.fixture(autouse=True) def media_dir(tmpdir, settings): @@ -201,6 +201,9 @@ def json_corpus_data(): @pytest.fixture(autouse=True) -def json_mock_corpus(db, json_corpus_data): +def json_mock_corpus(db, json_corpus_data) -> Corpus: # add json mock corpora to the database at the start of each test - return import_json_corpus(json_corpus_data) + serializer = CorpusEditSerializer(data=json_corpus_data) + assert serializer.is_valid() + corpus = serializer.create(serializer.validated_data) + return corpus From dd189654b81f4e26321ab770a55e180be4f53fc7 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 May 2024 12:22:16 +0200 Subject: [PATCH 50/94] clear corpora before testing import --- backend/addcorpus/json_corpora/tests/test_import.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index d87dfe887..8ef362b46 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -1,10 +1,12 @@ from datetime import date from addcorpus.json_corpora.import_json import _parse_field -from addcorpus.models import Field +from addcorpus.models import Field, Corpus from addcorpus.serializers import CorpusEditSerializer -def test_import(db, json_corpus_data): +def test_json_corpus_import(db, json_corpus_data): + Corpus.objects.all().delete() + serializer = 
CorpusEditSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) @@ -35,6 +37,8 @@ def test_import(db, json_corpus_data): def test_serializer_representation(db, json_corpus_data): + Corpus.objects.all().delete() + serializer = CorpusEditSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) From d6f632937aa51bd2afc615298d35e65e461bb931 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 May 2024 12:34:24 +0200 Subject: [PATCH 51/94] fix update serializer method --- backend/addcorpus/json_corpora/export_json.py | 2 +- .../addcorpus/json_corpora/tests/test_export.py | 1 + .../addcorpus/json_corpora/tests/test_import.py | 14 ++++++++++++-- backend/addcorpus/serializers.py | 4 ++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/backend/addcorpus/json_corpora/export_json.py b/backend/addcorpus/json_corpora/export_json.py index 5178590fc..6ba9c130a 100644 --- a/backend/addcorpus/json_corpora/export_json.py +++ b/backend/addcorpus/json_corpora/export_json.py @@ -6,7 +6,7 @@ def export_json_corpus(corpus: Corpus) -> Dict: config = corpus.configuration - data = {'name': corpus.name} + data = {'name': corpus.name, 'id': corpus.pk } data['meta'] = export_corpus_meta(config) data['source_data'] = export_corpus_source_data(config) options = export_corpus_options(config) diff --git a/backend/addcorpus/json_corpora/tests/test_export.py b/backend/addcorpus/json_corpora/tests/test_export.py index b1e046c9d..b1ee9f691 100644 --- a/backend/addcorpus/json_corpora/tests/test_export.py +++ b/backend/addcorpus/json_corpora/tests/test_export.py @@ -4,6 +4,7 @@ def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data): result = export_json_corpus(json_mock_corpus) + result.pop('id') assert result == json_corpus_data def test_field_export(any_field_json): diff --git a/backend/addcorpus/json_corpora/tests/test_import.py 
b/backend/addcorpus/json_corpora/tests/test_import.py index 8ef362b46..1430ea931 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -2,7 +2,7 @@ from addcorpus.json_corpora.import_json import _parse_field from addcorpus.models import Field, Corpus from addcorpus.serializers import CorpusEditSerializer - +from addcorpus.models import Corpus, CorpusConfiguration def test_json_corpus_import(db, json_corpus_data): Corpus.objects.all().delete() @@ -44,9 +44,19 @@ def test_serializer_representation(db, json_corpus_data): corpus = serializer.create(serializer.validated_data) serialized = serializer.to_representation(corpus) - + serialized.pop('id') assert json_corpus_data == serialized +def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus): + json_corpus_data['meta']['description'] = 'A different description' + serializer = CorpusEditSerializer(data=json_corpus_data) + assert serializer.is_valid() + serializer.update(json_mock_corpus, serializer.validated_data) + + corpus_config = CorpusConfiguration.objects.get(corpus=json_mock_corpus) + assert corpus_config.description == 'A different description' + + def test_parse_content_field(content_field_json): data = _parse_field(content_field_json) diff --git a/backend/addcorpus/serializers.py b/backend/addcorpus/serializers.py index 67296f8f9..ca1fae546 100644 --- a/backend/addcorpus/serializers.py +++ b/backend/addcorpus/serializers.py @@ -158,8 +158,8 @@ def update(self, instance: Corpus, validated_data: Dict): corpus.save() configuration, _ = CorpusConfiguration.objects.get_or_create(corpus=corpus) - for attr in validated_data: - setattr(configuration, attr, validated_data[attr]) + for attr in configuration_data: + setattr(configuration, attr, configuration_data[attr]) configuration.save() for field_data in fields_data: From b637b0461be99d66c2c704239cca5022bfb7e0d6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 May 
2024 12:38:20 +0200 Subject: [PATCH 52/94] add test for removing field --- backend/addcorpus/json_corpora/tests/test_import.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 1430ea931..1004ff33e 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -48,14 +48,21 @@ def test_serializer_representation(db, json_corpus_data): assert json_corpus_data == serialized def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus): + # edit description json_corpus_data['meta']['description'] = 'A different description' serializer = CorpusEditSerializer(data=json_corpus_data) assert serializer.is_valid() serializer.update(json_mock_corpus, serializer.validated_data) - corpus_config = CorpusConfiguration.objects.get(corpus=json_mock_corpus) assert corpus_config.description == 'A different description' + # remove a field + assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 2 + json_corpus_data['fields'] = json_corpus_data['fields'][:-1] + serializer = CorpusEditSerializer(data=json_corpus_data) + assert serializer.is_valid() + serializer.update(json_mock_corpus, serializer.validated_data) + assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 1 def test_parse_content_field(content_field_json): From 3894a37c3f24cdd3484a49dd8ebdf61e3c282812 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 May 2024 13:39:14 +0200 Subject: [PATCH 53/94] add corpus edit viewset to API --- backend/addcorpus/tests/test_corpus_views.py | 21 ++++++++++++++++++++ backend/addcorpus/views.py | 12 +++++++++-- backend/ianalyzer/urls.py | 2 ++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 
525ec021c..913b4e8f4 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -1,4 +1,7 @@ from rest_framework import status +from django.test.client import Client +from typing import Dict + from users.models import CustomUser from addcorpus.models import Corpus from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora @@ -80,3 +83,21 @@ def test_corpus_not_publication_ready(admin_client, basic_mock_corpus): response = admin_client.get('/api/corpus/') corpus = not any(c['name'] == basic_mock_corpus for c in response.data) + +def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mock_corpus: Corpus): + json_mock_corpus.delete() + + response = admin_client.get('/api/corpus/edit/') + assert status.is_success(response.status_code) + assert len(response.data) == 0 + + response = admin_client.post( + '/api/corpus/edit/', + json_corpus_data, + content_type='application/json', + ) + assert status.is_success(response.status_code) + + response = admin_client.get('/api/corpus/edit/') + assert status.is_success(response.status_code) + assert len(response.data) == 1 diff --git a/backend/addcorpus/views.py b/backend/addcorpus/views.py index 3cf5ec94c..5d70939ac 100644 --- a/backend/addcorpus/views.py +++ b/backend/addcorpus/views.py @@ -1,10 +1,10 @@ from rest_framework.views import APIView -from addcorpus.serializers import CorpusSerializer, CorpusDocumentationPageSerializer +from addcorpus.serializers import CorpusSerializer, CorpusDocumentationPageSerializer, CorpusEditSerializer from rest_framework.response import Response from addcorpus.python_corpora.load_corpus import corpus_dir, load_corpus_definition import os from django.http.response import FileResponse -from rest_framework.permissions import IsAuthenticatedOrReadOnly +from rest_framework.permissions import IsAuthenticatedOrReadOnly, IsAdminUser from addcorpus.permissions import CorpusAccessPermission, filter_user_corpora 
from rest_framework.exceptions import NotFound from rest_framework import viewsets @@ -86,3 +86,11 @@ class CorpusDocumentView(APIView): def get(self, request, *args, **kwargs): return send_corpus_file(subdir='documents', **kwargs) + + +class CorpusEditViewset(viewsets.ModelViewSet): + permission_classes = [IsAdminUser] + serializer_class = CorpusEditSerializer + + def get_queryset(self): + return Corpus.objects.filter(has_python_definition=False) diff --git a/backend/ianalyzer/urls.py b/backend/ianalyzer/urls.py index dec20bc80..3faf7dd5a 100644 --- a/backend/ianalyzer/urls.py +++ b/backend/ianalyzer/urls.py @@ -33,10 +33,12 @@ from media import urls as media_urls from tag import urls as tag_urls from tag.views import TagViewSet +from addcorpus.views import CorpusEditViewset api_router = routers.DefaultRouter() # register viewsets with this router api_router.register('search_history', QueryViewset, basename='query') api_router.register('tag/tags', TagViewSet) +api_router.register('corpus/edit', CorpusEditViewset, basename='corpus') if settings.PROXY_FRONTEND: spa_url = re_path(r'^(?P.*)$', proxy_frontend) From 4025652b00bb21b27702d3a76903e4245c95dcd8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 14 May 2024 16:16:52 +0200 Subject: [PATCH 54/94] update corpus definitions documentation --- documentation/Corpus-definitions.md | 69 +++++++++++++++++------------ 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/documentation/Corpus-definitions.md b/documentation/Corpus-definitions.md index 2dd96d0fb..ec246792b 100644 --- a/documentation/Corpus-definitions.md +++ b/documentation/Corpus-definitions.md @@ -2,55 +2,68 @@ Corpus definitions are the way that we configure each corpus in I-analyzer. -This documents gives a basic explanation of how corpus definitions "work" in the backend. 
For a more elaborate description of how to add corpus definition and what properties they contain, see [How to add a new corpus to I-analyzer](/documentation/How-to-add-a-new-corpus-to-Ianalyzer.md). +This documents gives a basic explanation of how corpus definitions "work" in the backend. It introduces the core concepts and mechanics. -## Definition +## Corpus definitions -At the moment, I-analyzer supports one method to define a corpus: by writing a custom Python class and including it in the backend. +Each corpus is defined by a collection of metadata. This describes things like: -We plan to support adding corpora through the interface; as we're working on this, some modules may suggest this possibility. +- basic information to display in the interface +- what fields the corpus contains, and how they should be configured +- support for optional functionality like word models or media attachments +- where and how to extract data from source files -### Python class +Note that a corpus definition does not include the actual data (i.e. documents), it just tells you where to find it. Reading the source data and loading it into the database is a separate action called *indexing*. + +Corpora can be created in two ways: + +- a **Python corpus** is defined in a Python module. Most data from this module is loaded into the databse, but the module also implements custom functions for complex functionality, such as data extraction. +- a **database-only corpus** is only represented in the database and does not use any custom Python functions. It offers less customisation, but is easier to create. -Python-based corpora are written as Python classes. Each definition is a subclass of `CorpusDefinition`. +> [!NOTE] +> Database-only corpora are a new feature that is still in development. This option is not yet recommended for production. 
-All corpus classes are contained in the [corpora](/backend/corpora/) directory (though it is technically possible to include definitions from elsewhere in the file system). This directory is not a Django app, but just a collection of scripts and metadata. +The sections below give a a more detailed overview of the differences between these options. Per option, there is also a more detailed description of how it works. -To be loaded into the application, the definition needs to be added to Django settings. The project includes a `CORPORA` setting which defines a mapping of names and python files, and lists which definitions should be loaded. +## Python vs. database-only corpora -On startup, all configured python classes will be loaded into the database. During much of the runtime, the backend will refer to the database model rather than the python class. However, the python class is imported for more advanced features like document scans and word models. It is also used during indexing. +These are the key differences between Python and database-only corpora. -## Database models +### Data extraction -Python definitions can be loaded into the database with the `loadcorpora` command in the backend. Normally, this is run when you start the server. +A Python corpus can theoretically extract data from any format. In practice, we rely on the [ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/) package which provides extraction utilities for common file types like CSV and XML, but the methods for extraction can as complex as you want. The design philosophy is that you can use the original format of a dataset as the source data for I-analyzer, without any pre-processing. -This command will parse any configured python corpora and save a `Corpus` and `CorpusConfiguration` object for them. If the python corpus cannot be loaded, the `Corpus` object will still exist in the database, it will be inactive. 
+A database-only corpus only supports CSV extraction with very little room for customisation. Here, the idea is that you pre-process your data *before* you pass it on to I-analyzer. If it is convenient, you can use the `ianalyzer_readers` package to do so. -### Corpus vs. CorpusConfiguration +### Customisation of the interface -The `CorpusConfiguration` model (and its related model `Field`) contains anything coming from the corpus definition class. It has a one-to-one relationship with `Corpus`. +Generally speaking, Python corpora support more customisation of the interface, while the process for entering database-only corpora is designed to infer a lot of these options. -The primary distinction is that `CorpusConfiguration` does not need to be preserved when you import corpus definitions: this model should be completely determined by the definition file, so it is overwritten each time. +For example, if you want the search interface to show a filter for a field in the corpus, a Python corpus requires that you enter a custom description for the filter, but if you use the API for database-only corpora, the description will be auto-generated. -`Corpus` is intended as a stable object that will be preserved when loading corpus definitions. This allows it to function as a reference point for search history, permissions, et cetera. +This means that some customisation is only available for Python corpora. However, it also means that database-only corpora can offer a more streamlined and accessible process for creating a corpus definition. -### The django admin +### Advanced functionality + +Database-only corpora do not support some advanced functionality. Notably: +- word models (i.e. word embeddings) +- media attachments to documents +- updating the data of a corpus instead of re-indexing it from scratch + +### Python class -The Django admin interface is enabled for `CorpusConfiguration` and `Field`, mostly for the sake of providing an overview to developers. 
While it is possible to adjust settings here, they will be overwritten the next time you import corpus definitions. +Python-based corpora are written as Python classes. Each definition should be a subclass of `CorpusDefinition`. -### Hiding and deleting corpora +The [corpora](/backend/corpora/) directory contains definitions for all corpora we create. (On top of that, [corpora_test](/backend/corpora_test/) defines corpora for for unit tests. Corpora *can* be saved anywhere.) This directory is not a Django app, but just a collection of scripts and metadata. -If you want to remove a corpus from your environment, remove it from the Django settings. +To be imported into the application, a definition needs to be added in the Django project settings. The `CORPORA` setting defines a mapping of names and python files, which declares what definitions should be loaded. -Removing a corpus from the settings will not delete the `Corpus` object. It has the following effect: +When you start up a server, all configured corpus definitions will be imported into the database. During much of the runtime, the backend will refer to the database model rather than the Python class. However, this class can be loaded for more advanced features where custom functions may be used. The most common situation where this happens is when you index the source data. -- The properties `corpus.active` and `corpus.has_python_definition` are set to `False`. -- The methods `corpus.ready_to_index()` and `corpus.read_to_publish()` will return `False`. -- The corpus will be hidden from the API and interface -- The python definition will no longer be imported during startup +While Python corpora are *represented* in the database, the source code is still seen as the ultimate source of truth. Each time you start up the server, the corpus is imported again, and this overwrites any changes that that were made to the database in the meantime. 
-Since the underlying `Corpus` is not actually deleted, related search history, downloads, tags, and permissions will be preserved. If you reinstate the corpus in settings, all of these will function as before. +## Database-only corpora -At this point, you can also remove the `Corpus` object completely, which will remove all related data. +Database-only corpora are just database objects, so unlike Python corpora, they have no single method to be created. -Note: if you want to temporarily hide a corpus in a production environment, it may be easier to adjust permissions in the Django admin, rather than adjust settings. +However, database-only corpora have a JSON API that is the recommended way of entering corpora. You can write a corpus definition as a JSON and import it through the API. In the future, we also plan to have a form that connects to this API. From d34a21a1770231667ba2b6f6889e6c385e90a8bb Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 14 May 2024 16:29:05 +0200 Subject: [PATCH 55/94] add database models documentation --- documentation/Corpus-database-models.md | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 documentation/Corpus-database-models.md diff --git a/documentation/Corpus-database-models.md b/documentation/Corpus-database-models.md new file mode 100644 index 000000000..fecb2b86d --- /dev/null +++ b/documentation/Corpus-database-models.md @@ -0,0 +1,49 @@ +# Corpus database models + +As discussed in [corpus definitions](/documentation/Corpus-definitions.md), corpora can be defined in a Python class that is partially represented in the database, or be database-only. This document provides more detail about the structure of the database and how Python corpora are imported. 
+ +## Database models + +A full corpus definition is represented in four models: + +- `Corpus` - the main reference point for the corpus +- `CorpusConfiguration` - has a one-to-one relationship with `Corpus` and represents all configured metadata +- `Field` - has a one-to-many relationship with `CorpusConfiguration` and represents a field in the corpus. +- `CorpusDocumentationPage` - has a many-to-many relationship with `CorpusConfiguration` and represents documentation for users. + +These are defined in [/backend/addcorpus/models.py](/backend/addcorpus/models.py). + +### Corpus vs. CorpusConfiguration + +The distinction between a `Corpus` and its `CorpusConfiguration` is that the configuration is less stable. The `CorpusConfiguration` (and its related `Field` instances) is completely overwritten when you import a corpus defintion. + +On the other hand, the `Corpus` contains information about corpus access that is not covered by the definition. It also serves as the reference point for relationships with other models, such as search history, downloads, and document tags. + + ## Importing Python corpora + +Python definitions can be loaded into the database with the `loadcorpora` command in the backend. Normally, this is run when you start the server, so you do not need to run it manually. + +This command will parse any configured python corpora and save a database representation for them. If the python corpus cannot be loaded, the `Corpus` object will still exist in the database, but it will be inactive. + +If a corpus by the same name already exists in the database, the command will completely overwrite its `CorpusConfiguration` and `Field` instances. This means that, aside from adjusting permissions, changing the database representation of a corpus with a Python definition is always temporary. If you want to make permanent changes to the corpus, adjust the Python definition and run `loadcorpora` again. 
+ +## Corpus visibility + +Corpora have an `active` status that determines whether they are available for searching. In addition, you can configure the `groups` connected to a corpus, which determines who has access to it. A user will see a corpus if it is active and they are in a group that is given access. (A superuser implicitly has access to all active corpora.) + +I-analyzer always includes a group named `'basic'`, which everyone is a member of by default, including anonymous users. So if you want a corpus to be public, add this group to it. + +While a corpus is inactive, its validation is less strict. This allows you to build a database-only corpus in steps, and save an incomplete definition as a work in progress. See [Corpus validation](/documentation/Corpus-validation.md) for more details. + +### Disabling and deleting Python corpora + +If you want to remove a Python corpus from your environment, or transition it to a database-only corpus, remove it from the Django settings. + +Removing a corpus from the settings will not delete the `Corpus` object. It has the following effect: + +- The python definition will no longer be imported during startup. +- The next time you run `loadcorpora`, the properties `corpus.active` and `corpus.has_python_definition` are set to `False`. As the corpus is now inactive, it will be hidden from the search interface. + +Since the underlying `Corpus` is not actually deleted, related search history, downloads, tags, and permissions will be preserved. If you reinstate the corpus in settings, all of these will function as before. + +At this point, you can also remove the `Corpus` object completely, which will remove all related data. 
From 7efc690d18d11697f242059cf46259ac8da0f34d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 14 May 2024 16:29:18 +0200 Subject: [PATCH 56/94] rename documentation file --- documentation/README.md | 3 ++- ...o-Ianalyzer.md => Writing-a-corpus-definition-in-Python.md} | 0 2 files changed, 2 insertions(+), 1 deletion(-) rename documentation/{How-to-add-a-new-corpus-to-Ianalyzer.md => Writing-a-corpus-definition-in-Python.md} (100%) diff --git a/documentation/README.md b/documentation/README.md index eb10515b9..3e15c2149 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -11,7 +11,8 @@ This directory contains documentation for developers. ## Adding corpora - [Corpus definitions](./Corpus-definitions.md) -- [How to add a new corpus to I-analyzer](./How-to-add-a-new-corpus-to-Ianalyzer.md) +- [Corpus database models](/Corpus-database-models.md) +- [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) - [Defining corpus fields](./Defining-corpus-fields.md) - [Indexing corpora](./Indexing-corpora.md) - [Indexing in deployment](./Indexing-on-server.md) diff --git a/documentation/How-to-add-a-new-corpus-to-Ianalyzer.md b/documentation/Writing-a-corpus-definition-in-Python.md similarity index 100% rename from documentation/How-to-add-a-new-corpus-to-Ianalyzer.md rename to documentation/Writing-a-corpus-definition-in-Python.md From 8494f0a72640487a33cf117dc0d53fbec5e0b150 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 12:55:50 +0200 Subject: [PATCH 57/94] update corpus definition documentation --- .../Writing-a-corpus-definition-in-Python.md | 142 +++++++++--------- 1 file changed, 74 insertions(+), 68 deletions(-) diff --git a/documentation/Writing-a-corpus-definition-in-Python.md b/documentation/Writing-a-corpus-definition-in-Python.md index 824e54790..f6756e8f0 100644 --- a/documentation/Writing-a-corpus-definition-in-Python.md +++ b/documentation/Writing-a-corpus-definition-in-Python.md 
@@ -1,6 +1,8 @@ -# How to add a new corpus to I-analzyer +# Writing a corpus definition in Python -The steps of adding a new corpus are usually the following: +This document is a guide to writing a Python corpus definition. + +The steps of adding a new Python corpus are usually the following: - Create a new python class in the I-analyzer repository, which will describe the corpus - Include the corpus in your local django settings and include (local) source data @@ -8,14 +10,19 @@ The steps of adding a new corpus are usually the following: - Create and populate a local elasticsearch index for the corpus - Workshop the corpus definition, add unit tests - Make a pull request -- Create and populate a production elasticsearch index on the test server (using your test branch) -- Include the corpus definition in the next release and deploy it in production +- Create and populate a production elasticsearch index on the production cluster using your test branch. (We use a dedicated I-analyzer instance for indexing.) +- Include the corpus definition in the next release and deploy it in production. - Verify everything works as expected and adjust the corpus permissions in the production admin interface, so users can see it. ## Corpus definition -Adding a new corpus starts by adding a new corpus description `corpusname.py` to the `backend/corpora` directory. The corpus description imports global variables from `backend/ianalyzer/settings.py`. The definition file should be listed under `CORPORA` in the settings. In a development environment, this should happen in `backend/ianalyzer/settings_local.py`. More on the use of settings below. -The corpus definition is a python class definition, subclassing the `CorpusDefinition` class (found in `addcorpus/corpus.py`). 
You will normally use a datatype-specific subclass of `CorpusDefinition`, like this: +Start by adding a new Python module `corpusname.py` to the `backend/corpora` directory, and include in the `CORPORA` setting of your Django settings. (Use `settings_local.py` to set this for your own development server only.) + +The actual definition is a class that you define in this module. It should subclass the [`CorpusDefinition` class](/backend/addcorpus/python_corpora/corpus.py). This class includes some default values for attributes and default behaviour. + +It also inherits the `Reader` class from [`ianalyzer_readers`](https://ianalyzer-readers.readthedocs.io/en/latest/) which provides very minimal functionality for reading source files. Most corpus definitions also inherit from a more specific `Reader` that provides functionality for the type of source data, e.g. `XMLReader`, `CSVReader`, etc. For convenience, you can use the classes `XMLCorpusDefinition`, `CSVCorpusDefinition`, etc., defined in [corpus.py](/backend/addcorpus/python_corpora/corpus.py). See [the documentation of ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/) for the available `Reader` classes and the API for each of them. + +Your definition module should now look something like this: ```python from addcorpus.corpus import CSVCorpusDefinition @@ -24,65 +31,86 @@ class MyCorpus(CSVCorpusDefinition): pass ``` -The `CorpusDefinition` classes inherit functionality from the package `ianalyzer_readers`, which defines more general `Reader` classes to read data from source files. +This class will describe all metadata for the corpus, but various attributes and methods need to be filled in before the corpus is ready for use. -This provides the basis for an I-analyzer corpus that will define how to read the source data, index it to elasticsearch, and present a search interface in the frontend. However, most properties still need to be filled in. 
+Many of these values will be hard-coded in the definition class, but some will need to be imported from the project settings, because they need to be configurable. (For example, the location of source data.) More on the use of settings below. -The corpus class should define the following properties: +## Attributes and methods -- `title`: Title to be used in the interface. -- `description`: Short description, appears as a subtitle in the interface. -- `data_directory`: Path to the directory containing source files. Always get this from the setttings. You can also set this to `None`; usually because you are getting source data from an API instead of a local directory. -- `min_date`, `max_date`: The minimum and maximum dates for documents. -- `es_index`: the name of the index in elasticsearch. -- `image`: a path or url to the image used for the corpus in the interface. -- `fields`: a list of `Field` objects. See [defining corpus fields](./Defining-corpus-fields.md). -- `languages`: a list of ISO 639 codes of the languages used in your corpus. Corpus languages are intended as a way for users to select interesting datasets, so only include languages for which your corpus contains a meaningful amount of data. The list should go from most to least frequent. -- `category`: the type of data in the corpus. The list of options is in `backend/addcorpus/constants`. +### Required attributes -The following properties are optional: -- `es_alias`: an alias for the index in elasticsearch. -- `es_settings`: overwrites the `settings` property of the elasticsearch index. Can be generated using [es_settings.py](../backend/addcorpus/es_settings.py) -- `scan_image_type`: the filetype of scanned documents, if these are included. -- `allow_image_download` -- `document_context`: specifies fields that define the natural grouping of documents. -- `default_sort`: specifies the default method to sort search result. 
-- `language_field`: if your corpus contains documents in multiple language, you can specify the name of the field that stores the IETF tag for each document. +The following attributes are required for a corpus to function. -Several additional attributes allow you to specify files containing documentation; see [including documentation files](#including-documentation-files). +| Attribute | Type | Description | +|-----------|------|-------------| +| `title` | `str` | Title to be used in the interface. | +| `description` | `str` | Short description; appears as a subtitle in the interface. | +| `min_date` | `datetime.date` | The minimum date for the data in the corpus. This is shown as metadata in the corpus overview. It is not used to restrict the data. | +| `max_date` | `datetime.date` | The maximum date for the data - analogous to `min_date`. | +| `category` | `str` | The type of data in the corpus. See the [options for categories](/backend/addcorpus/constants.py). | +| `languages` | `List[str]` | A list of IETF tags of the languages used in your corpus. Corpus languages are intended as a way for users to select interesting datasets, so only include languages for which your corpus contains a meaningful amount of data. The list should go from most to least frequent. You can also include `''` for "unknown". | +| `es_index` | `str` | The name of the elasticsearch index. In development, the corpus name will do. On a production cluster, you may need to use a particular prefix. | +| `data_directory` | `Optional[str]` | Path to the directory containing source files. Always get this from the setttings. You can also set this to `None`; usually because you are getting source data from an API instead of a local directory. | +| `fields` | `List[Field]` | The fields for the corpus. See [defining corpus fields](./Defining-corpus-fields.md). 
| -The corpus class should also define a function `sources(self, start, end)` which iterates source files (presumably within on `data_directory`). The `start` and `end` properties define a date range: if possible, only yield files within the range. Each source file should be tuple of a filename and a dict with metadata. +### Required methods -### Different types of readers +The corpus class must define a method `sources(self, **kwargs)`. See the [API documentation of ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/stable/api/). When you run the indexing command, I-analyzer can provide two named arguments, `start` and `end`, which give a minimum and maximum date to select source files. -The `CorpusDefinition` class is a subclass of the `Reader` in `ianalyzer_readers`. `Reader` is a base class that does not provide much for data extraction. +### Optional attributes -Most corpus definitions also inherit from a more specific `Reader` that provides functionality for the type of source data, e.g. `XMLReader`, `CSVReader`, etc. For convenience, you can use the classes `XMLCorpusDefinition`, `CSVCorpusDefinition`, etc., defined in [corpus.py](/backend/addcorpus/python_corpora/corpus.py). +| Attribute | Type | Description | +| `image` | `str` | The filename of the image used for the corpus in the interface. (See below.) | +| `es_alias` | `str` | An alias that you want to assign to the index in elasticsearch. | +| `es_settings` | `Dict` | Customises the settings of the elasticsearch index. Can be generated using [es_settings.py](../backend/addcorpus/es_settings.py) | +| `scan_image_type` | `str` | The MIME type of media attachments to documents, if these are included. | +| `allow_image_download` | `bool` | If the corpus has media attachments, this controls if they can be downloaded from the interface. | +| `document_context` | `Dict` | Defines how to group documents into a "context". 
For example, if each document is a page, you can configure this setting so users can view a book. See the docstring of this attribute for details. | +| `default_sort` | `Dict` | Defines the default method to sort search results if the user has not entered a query. See the docstring of this attribute for details. | +| `language_field` | `str` | If the corpus contains documents in multiple languages, this can specify the name of the field that stores the IETF tag for each document. | +| `description_page` | `str` | Name of the file with a general description of the corpus. See below. | +| `citation_page` | `str` | Name of the file with citation guidelines. See below. | +| `wordmodels_page` | `str` | Name of the file documenting word models. See below. | +| `license_page` | `str` | Name of the file containing a licence for the data. See below. | +| `terms_of_service_page` | `str` | Name of the file containing terms of service. See below. | -See [the documentation of ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/) for the available `Reader` classes and the API for each of them. +### Documentation files and corpus image -### Including documentation files +If you include a corpus image or documentation pages, these need to be included as separate files. -Documentation pages can be added as markdown files in the corpus directory. See [corpus documentation](/documentation/Corpus-documentation.md) for more information about writing these files. +Each file should be located in a specific subdirectory of the directory that contains your definition module. Specifically: -The method `documentation_path(page_type)` on the `CorpusDefinition` class points to the files that are included. It takes a documentation type as input and returns the path to the file, relative to the directory containing the definition. 
+| Filename attribute | Subdirectory | +|--------------------|--------------| +| `image` | `images` | +| `description_page` | `description` | +| `citation_page` | `citation` | +| `wordmodels_page` | `wm` | +| `license_page` | `license` | +| `terms_of_service_page` | `terms_of_service` | -The default implementation of `documentation_path` will look at the following attributes of the corpus: +This means that if your corpus includes `image = 'mycorpus.jpg'`, your directory should be structured like this: -- `description_page`: a path relative to `./description/` -- `citation_page`: a path relative to `./citation/` -- `wordmodels_page`: a path relative to `./wm/` -- `license_page`: a path relative to `./license/` -- `terms_of_service_page`: a path relative to `./terms_of_service/` +``` +mycorpus/ +├ mycorpus.py +└ images/ + └ mycorpus.jpg +``` + +The image can be any image file. Documentation pages must be markdown files. See [corpus documentation](/documentation/Corpus-documentation.md) for more information about writing documentation. -## Settings file +## Using project settings -The django settings can be used to configure variables that may depend on the environment. Please use the following naming convention when you add settings for your corpus. +Several of the attributes in a corpus definition need to be configurable per environment. This is done by including these values in the project settings. + +Please use the following naming convention when you add settings for your corpus. ```python CORPUSNAME_DATA = '/MyData/CorpusData' # the directory where the xml / html or other files are located CORPUSNAME_ES_INDEX = 'dutchbanking' # the name that elasticsearch gives to the index CORPUSNAME_SCAN_IMAGE_TYPE = 'image/png' #mimetype of document media +# etc... 
``` These can be retrieved in the corpus definition, for example: @@ -97,35 +125,13 @@ class Times(XMLCorpus): max_date = datetime(year=2010, month=12, day=31) data_directory = settings.TIMES_DATA es_index = getattr(settings, 'TIMES_ES_INDEX', 'times') - ... + # ... ``` -Note that for a property like the elasticsearch index, we define a default value but make it possible to override this in the settings file. - -### Corpus selection - -To include the new corpus in an instance of I-analyzer, the project settings must be adjusted. - -The [CORPORA setting](/documentation/Django-project-settings.md#corpora) must be updated to include the corpus in your project. - -Additionally, you can specify an elasticsearch server for the corpus with the [CORPUS_SERVER_NAMES setting](/documentation/Django-project-settings.md#corpus_server_names). - - -## Elasticsearch -Once the corpus definition and associated settings are added, the only remaining step is to make the Elasticsearch index. By running `yarn django index corpusname`, information is extracted and sent to Elasticsearch. -Optional flags: -- `-s 1990-01-01` sets different start date for indexing -- `-e 2000-12-31` sets different end data for indexing -- `-d` specifies that an existing index of the same name should be deleted first (if not specified, defaults to false, meaning that extra data can be added while existing data is not overwritten) - -The start and end date flags are passed on the `sources` function of the corpus (see above). If you did not utilise them there, they will not do anything. - -## Validation - -The `CorpusDefinition` class has no built-in validation. However, once you start using the corpus, many of the properties defined in the python class will be loaded into the `CorpusConfiguration` database model. This step does include some validation, so it may raise errors. You can run the import script with `yarn django loadcorpora`. It is also run when you start a development server with `yarn start-back`. 
+Note that for a property like the elasticsearch index, we define a default value but make it possible to override this in the settings file, while the data directory is required. ## Unit testing -It is strongly recommended that you include unit tests for your corpus. A minimal test is to try to load your corpus into the database. In addition, it is recommended that you include some tests that check the output of the data extraction. +It is strongly recommended that you include unit tests for your corpus. A minimal test is to try to load your corpus into the database and check that it does not run into validation errors. In addition, it is recommended that you include some tests that check the output of the data extraction. The rechtspraak corpus includes good examples of such tests. From 6f0a533a856c6df1319921b49f05bb8b109d8f10 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 13:36:24 +0200 Subject: [PATCH 58/94] expand docstring for FieldDefinition --- backend/addcorpus/python_corpora/corpus.py | 103 +++++++++++++-------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/backend/addcorpus/python_corpora/corpus.py b/backend/addcorpus/python_corpora/corpus.py index 2ce7fe663..a82766ed5 100644 --- a/backend/addcorpus/python_corpora/corpus.py +++ b/backend/addcorpus/python_corpora/corpus.py @@ -2,7 +2,7 @@ Module contains the base classes from which corpora can derive; ''' -from typing import Optional +from typing import Optional, List, Dict from ianalyzer_readers import extract from datetime import datetime from os.path import isdir @@ -16,6 +16,8 @@ from ianalyzer_readers.readers.html import HTMLReader from ianalyzer_readers.readers.xlsx import XLSXReader +from addcorpus.python_corpora.filters import Filter + import logging logger = logging.getLogger('indexing') @@ -352,49 +354,68 @@ class FieldDefinition(Field): Definition for a single field in a corpus. 
Parameters: - name: a short hand name - display_name: the name shown to the user - display_type: how the field should be displayed in the client - description: an explanation of the field for users - indexed: whether the field is skipped during indexing - hidden: whether the field is hidden in the frontend - results_overview: whether the field appears in the preview of a document - csv_core: whether the field is pre-selected for CSV downloads - search_field_core: whether the field is immediately shown in field selection. - If `False`, the field is only shown when the user selects "show all fields". - visualizations: visualisations that are available for this field. Options: - resultscount, termfrequency, wordcloud, ngram. - es_mapping: the mapping of the field in Elasticsearch - language: the language of the field's content. Can be `None`, an IETF tag, or - `"dynamic"`. - search_filter: configuration of the search filter used for the field. - extractor: configuration to extract the field's data from source documents - sortable: whether this field is shown as an option to sort search results. - searchable: whether this field is shown in the selection for search fields. - downloadable: whether this field may be included when downloading results. - required: whether this field is required during source extraction. + name: A shorthand name. Must be a slug. + display_name: The name that should be shown to the user. If you leave this out, + the `name` will be used. + display_type: Determines how the field should be rendered. This can be any + supported elasticsearch mapping type, `'url'`, or `'text_content'`. If you + leave this blank, the mapping type of `es_mapping` will be used, so this + only needs to be specified for URL and text content fields. + description: An explanation of the field for users. + indexed: Whether the field is skipped during source extraction and indexing. + hidden: Whether the field is hidden in the frontend. 
+ results_overview: Whether the field appears in the preview of a document. + csv_core: Whether the field is pre-selected for CSV downloads of search results. + search_field_core: Whether the field is immediately shown in field selection for + the search query. If `False`, the field is only shown when the user selects + "show all fields". + visualizations: Visualisations that are enabled for this field. Options: + resultscount, termfrequency, wordcloud, ngram. For date fields and + categorical/ordinal fields (usually keyword type), you can use + `['resultscount', 'termfrequency']`. For text fields, you can use + `['wordcloud', 'ngram']`. However, the ngram visualisation also requires that + the corpus has a date field. + visualization_sort: If the visualisations include resultscount or termfrequency + and the field is not a date field, this determines how the histogram is + sorted. Options are `'value'` (sort from most to least frequent) or `'key'` + (sort alphabetically). + es_mapping: The mapping of the field in Elasticsearch. It's recommended to use one + of the functions in `addcorpus.es_mappings` to construct this. + language: The language of the field's content. Can be `None`, an IETF tag, or + `"dynamic"`. None means the language is unknown or NA. Dynamic means the + `language_field` of the corpus specifies the IETF tag for this field's + language per document. + search_filter: Configuration of the search filter used for the field (if any). + Should be `Filter` instance. + extractor: Configuration to extract the field's data from source documents. Should + be an `Extractor` instance. + sortable: Whether this field is shown as an option to sort search results. + searchable: Whether this field is shown in the selection for search fields. + downloadable: Whether this field may be included when downloading results. + required: Whether this field is required during source extraction. Note that not + all Reader subclasses currently support this. 
''' def __init__(self, - name=None, - display_name=None, - display_type=None, - description='', - indexed=True, - hidden=False, - results_overview=False, - csv_core=False, - search_field_core=False, - visualizations=[], - visualization_sort=None, - es_mapping={'type': 'text'}, - language=None, - search_filter=None, - extractor=extract.Constant(None), - sortable=None, - searchable=None, - downloadable=True, - required=False, + name: str = None, + display_name: Optional[str] = None, + display_type: Optional[str] = None, + description: str = '', + indexed: bool = True, + hidden: bool = False, + results_overview: bool = False, + csv_core: bool = False, + search_field_core: bool = False, + visualizations: List[str] = [], + visualization_sort: str = None, + es_mapping: Dict = {'type': 'text'}, + language: Optional[str] = None, + search_filter: Optional[Filter] = None, + extractor: extract.Extractor = extract.Constant(None), + sortable: Optional[bool] = None, + searchable: Optional[bool] = None, + downloadable: bool = True, + required: bool = False, **kwargs ): From aec3fbf0f55d9d5c0fa081c99385136f7cc502bf Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 14:13:15 +0200 Subject: [PATCH 59/94] update python definition guide --- documentation/Defining-corpus-fields.md | 63 ------------------- documentation/Elasticsearch-index-settings.md | 48 ++++++++++++++ documentation/README.md | 2 +- .../Writing-a-corpus-definition-in-Python.md | 16 ++++- 4 files changed, 64 insertions(+), 65 deletions(-) delete mode 100644 documentation/Defining-corpus-fields.md create mode 100644 documentation/Elasticsearch-index-settings.md diff --git a/documentation/Defining-corpus-fields.md b/documentation/Defining-corpus-fields.md deleted file mode 100644 index 8ad29a7b7..000000000 --- a/documentation/Defining-corpus-fields.md +++ /dev/null @@ -1,63 +0,0 @@ -# Defining corpus fields - -Each corpus has a number of fields, which are extracted from source data. 
Each field is defined as a `Field` object, which defines how that field should be extracted from the source file, how it should be stored in elasticsearch, and how it should appear in the interface. See [corpus.py](../backend/addcorpus/corpus.py) for the class definition. - -## Extracting values - -The `extractor` attribute of a field should define how it extracts its data from source files. This value should be an instance of `Extractor`, which is defined in the `ianalyzer_readers` package. See [the API documentation of ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/api/#extractors) for a list of available extractors and their parameters. - -These extractors are typically sufficient for new corpora; if they are not, you can create a custom `Extractor` subclass for your corpus, or expand the `ianalyzer_readers` package. - -## Elasticsearch mapping - -Each field should specify its `es_mapping`, a dict that is passed on to elasticsearch to specify how it is indexed. See the [elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html). For common mappings, use the functions defined in [es_mappings.py](../backend/addcorpus/es_mappings.py) - -The property `indexed` determines whether the field should be skipped during source extraction. - -### Multifields - -Elasticsearch supports specifying a `fields` parameter to a field to define subfields, allowing a hierarchical structure to fields. I-analyzer is designed for all fields to exist on the same level, so subfields will not be visible in the interface. - -The one way in which multifields _are_ used is to allow different analyzers on the same text field. Text fields typically use the default analyzer, which performs basic tokenisation and converts text to lowercase. For more extensive analysis, subfields can be added. 
I-analyzer uses the following naming convention: - -- `*.clean_{language_string}`: uses a language-specific analyzer to filter stopwords. It has a suffix indicating the language this analyzer is for, e.g., `*.clean_en` for English. -- `*.stemmed_{language_string}`: uses a language-specific analyzer to filter stopwords and stem words. It has a suffix indicating the language this analyzer is for, e.g., `*.stemmed_en` for English. -- `*.length`: specifies the token count of the text, which is useful for aggregations. -- `*.text`: a field with text mapping. Can be added to a keyword field to support full-text search in the field. - -If you add fields with these names to the `es_mapping` of a text field, it enables some features in visualisations. If you add a multifield with these names that does not contain the expected type of data, some visualisations may not work. Do not do this. - -All of these multifields can be created through the functions in `es_mappings.py`. - -## Interface parameters - -The following properties determine how a field appears in the interface. - -`display_name` and `description` are optional and determine how a field is described in the interface. If you do not set `display_name`, the `name` property will be used instead. - -`display_type`: set to `text_content` if this is the main text. It will appear as such in the results overview. - -`results_overview` determines if a field should be included in the initial overview of a document in the results page. `hidden` determines if a field should be displayed when a user clicks on a result to see the complete document. - -`search_filter` can be set if the interface should include a search filter widget for the field. I-analyzer includes date filters, multiplechoice filters (used for keyword data), range filters, and boolean filters. See [filters.py](../backend/addcorpus/filters.py). - -`visualizations` optionally specifies a list of visualisations that apply for the field. 
Generally speaking, this is based on the type of data. For date fields and categorical/ordinal fields (usually keyword type), you can use `['resultscount', 'termfrequency']`. For text fields, you can use `['wordcloud', 'ngram']`. However, the ngram visualisation also requires that your corpus has a date field. - -If a field includes the `'resultscount'` and/or `'termfrequency'` visualisations and it is not a date field, you can also specify `visualisation_sort`, which determines how to sort the x-axis of the graph. Default is `'value'`, where categories are sorted based on the y-axis value (i.e., frequency). You may specify that they should be sorted on `'key'`, so that categories are sorted alphabetically (for keywords) or small-to-large (for numbers). - -`search_field_core` determines if a field is listed by default when selecting specific fields to search in. If it is not set to `True`, the user would have to click on "show all fields" to see it. - -`csv_core` determines if a field is included in the CSV download of search results by default. - -`sortable` determines whether a field should appear as a sort option. - -### Language - -For text and keyword fields, you can set the language of the field as follows: - -`language` specifies the language of the fields contents. Acceptable values are: -- `None`; use this if the langauge is unknown or not applicable (e.g. for numbers or dates) -- an [IETF language tag](https://en.wikipedia.org/wiki/IETF_language_tag); use this if the language is a constant -- `'dynamic'`; use this if the language is not always the same, and the IETF tag is stored in the `language_field` of the corpus. - -This language metadata is used to set the `lang` property of the DOM element in the interface. Because I-analyzer is an application for text analysis, it may have other uses in the future. 
diff --git a/documentation/Elasticsearch-index-settings.md b/documentation/Elasticsearch-index-settings.md
new file mode 100644
index 000000000..f9f4b25e9
--- /dev/null
+++ b/documentation/Elasticsearch-index-settings.md
@@ -0,0 +1,48 @@
+# Elasticsearch index settings
+
+This document details how elasticsearch indices for I-analyzer corpora are configured.
+
+## Field mappings
+
+Each corpus field passes a _mapping_ to elasticsearch that determines how the field is indexed - which affects what kind of queries it supports. See the [elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html) to read more about mappings in general.
+
+I-analyzer supports a limited number of mapping types, namely:
+
+- text
+- keyword
+- integer
+- float
+- boolean
+- date
+- date range
+- geo point
+
+Field definitions use the constructor functions defined in [es_mappings.py](../backend/addcorpus/es_mappings.py), so mappings are not normally defined directly in a corpus definition.
+
+### Multifields
+
+Elasticsearch supports specifying a `fields` parameter to a field, which allows the same value to function with multiple mappings. This allows the application to offer multiple options for a field that would otherwise be incompatible. For example, a field can be treated as categorical (e.g. support a histogram visualisation), while still allowing full-text search.
+
+The names of multifields are standardised in the application, and come with expectations about what that multifield does. If you add a multifield with these names that does not contain the expected type of data, some functionality may break. Do not do this.
+
+Multifields are never required, but they enable certain features in visualisations and the search interface.
+
+For *text* fields, multifields are used to enable different analysers. Normally, the main analyser for a field will only perform basic tokenisation and lowercasing. 
Multifields are used for more extensive language analysis: + +- `*.clean_{language_string}`: uses a language-specific analyser to filter stopwords. It has a suffix indicating the language this analyser is for, e.g., `*.clean_en` for English. +- `*.stemmed_{language_string}`: uses a language-specific analyser to filter stopwords and stem words. It has a suffix indicating the language this analyser is for, e.g., `*.stemmed_en` for English. +- `*.length`: specifies the token count of the text, which is useful for aggregations. + +For *keyword* fields, a multifield can be added to support full-text search: + +- `*.text`: uses a text mapping to support full-text search in the field. + +## Analysers + +If the field mappings include fields with `clean*`, `stemmed*`, or `length` multifields, the elasticsearch settings for the corpus must define the appropriate analysers. The module [es_settings.py](/backend/addcorpus/es_settings.py) contains a function `es_settings` to generate the settings. + +See [elasticsearch documentation about language analysers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-overview.html). + +The `clean*` and `stemmed*` analysers specify a stopword list, which is based on the stopwords corpus from [NLTK](https://www.nltk.org/). This analysis also includes a character filter to remove numbers. + + diff --git a/documentation/README.md b/documentation/README.md index 3e15c2149..d4bf977a4 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -13,7 +13,7 @@ This directory contains documentation for developers. 
- [Corpus definitions](./Corpus-definitions.md) - [Corpus database models](/Corpus-database-models.md) - [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) -- [Defining corpus fields](./Defining-corpus-fields.md) +- [Elasticsearch index settings](./Elasticsearch-index-settings.md) - [Indexing corpora](./Indexing-corpora.md) - [Indexing in deployment](./Indexing-on-server.md) - [Corpus validation](./Corpus-validation.md) diff --git a/documentation/Writing-a-corpus-definition-in-Python.md b/documentation/Writing-a-corpus-definition-in-Python.md index f6756e8f0..4ff0f3cbc 100644 --- a/documentation/Writing-a-corpus-definition-in-Python.md +++ b/documentation/Writing-a-corpus-definition-in-Python.md @@ -51,7 +51,7 @@ The following attributes are required for a corpus to function. | `languages` | `List[str]` | A list of IETF tags of the languages used in your corpus. Corpus languages are intended as a way for users to select interesting datasets, so only include languages for which your corpus contains a meaningful amount of data. The list should go from most to least frequent. You can also include `''` for "unknown". | | `es_index` | `str` | The name of the elasticsearch index. In development, the corpus name will do. On a production cluster, you may need to use a particular prefix. | | `data_directory` | `Optional[str]` | Path to the directory containing source files. Always get this from the setttings. You can also set this to `None`; usually because you are getting source data from an API instead of a local directory. | -| `fields` | `List[Field]` | The fields for the corpus. See [defining corpus fields](./Defining-corpus-fields.md). | +| `fields` | `List[Field]` | The fields for the corpus. See [defining fields](#definining-fields). | ### Required methods @@ -100,6 +100,20 @@ mycorpus/ The image can be any image file. Documentation pages must be markdown files. 
See [corpus documentation](/documentation/Corpus-documentation.md) for more information about writing documentation. +## Definining fields + +The `fields` property lists the configuration for each field in the corpus. Each of these defines how that field should be extracted from the source file, how it should be stored in elasticsearch, and how it should appear in the interface. See [corpus.py](../backend/addcorpus/corpus.py) for the class definition. + +Note that unlike with `CorpusDefinition`, fields are not defined as _classes_ but as _objects_. Rather than creating a custom subclass of `FieldDefinition`, you can just call the `FieldDefinition()` constructor with appropriate parameters. + +See the docstring of `FieldDefinition` for a comprehensive overview of all parameters. + +### Extracting values + +The `extractor` attribute of a field should define how it extracts its data from source files. This value should be an instance of `Extractor`, which is defined in the `ianalyzer_readers` package. See [the API documentation of ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/api/#extractors) for a list of available extractors and their parameters. + +These extractors are typically sufficient for new corpora; if they are not, you can create a custom `Extractor` subclass for your corpus, or expand the `ianalyzer_readers` package. + ## Using project settings Several of the attributes in a corpus definition need to be configurable per environment. This is done by including these values in the project settings. 
From 4d8ae94532e59d674c9f8dcdcc9d5a82806c80ba Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 14:17:35 +0200 Subject: [PATCH 60/94] Update indexing corpora documentation --- documentation/Indexing-corpora.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/documentation/Indexing-corpora.md b/documentation/Indexing-corpora.md index 0af546578..56e019b1a 100644 --- a/documentation/Indexing-corpora.md +++ b/documentation/Indexing-corpora.md @@ -1,12 +1,11 @@ # Indexing corpora -Indexing is the step to load corpus data into elasticsearch, which makes the data available through the I-analyzer interface. +Indexing is the step to read the source data of the corpus and load it into elasticsearch, which makes the data available through the I-analyzer interface. -You can start indexing once you have -- A python source file for the corpus -- A directory with source data -- Added the necessary properties to django settings -- Run the admin command `loadcorpora` (`yarn django loadcorpora`) +You can start indexing once you have: +- Created a definition for the corpus +- If it is a Python corpus: added necessary settings to your project, such as the source data directory. +- Imported the definition into the database. For Python corpora, run `yarn django loadcorpora` to do this. The basic indexing command is: @@ -28,10 +27,7 @@ Some options that may be useful for development: ### Date selection -`--start` / `-s` and `--end` / `-e` respectively give a start and end date to select source files. Note that this only works if the `sources` function in your corpus definition makes use of these options; not all corpora have this defined. (It is not always possible to infer exact dates from source file metadata.) - -The filtering of source files may not be exact (e.g. only take the year into account). These flags do *not* filter documents based on their contents. 
- +`--start` / `-s` and `--end` / `-e` respectively give a start and end date to select source files. Note that this only works if the `sources` function in your corpus definition makes use of these options; not all corpora have this defined. (It is not always possible to infer dates from source file metadata without parsing the file.) ## Production From 389e283b9287d0a488f74ff9f73576a8e50c08c8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 14:21:58 +0200 Subject: [PATCH 61/94] Update indexing on server documentation --- documentation/Indexing-on-server.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/documentation/Indexing-on-server.md b/documentation/Indexing-on-server.md index 7cad303e1..a0c0b379c 100644 --- a/documentation/Indexing-on-server.md +++ b/documentation/Indexing-on-server.md @@ -6,8 +6,15 @@ For production environments, we use *versioned* index names (e.g. `times-1`, `ti ## Moving data to server On the server, move data to a location in the `/its` share. -## Deployment settings -In the Deployment repository, set the variables of the corpus. +## Import the corpus + +### Database-only corpora + +Add the corpus to the database of the indexing server. You can use the JSON export/import to do this easily. After importing the JSON representation, use the Django admin menu to add the data directory for the corpus (the path to the data on the `/its` share). + +### Python corpora + +If the corpus is based on a Python definition, adjust the deployment repository to include necessary settings. At the very least, you should set: - `YOUR_CORPUS_DATA` the location on the `/its` share. 
From 32d1a7c3f2076617ca62bbee6c92a1d42f66affe Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 14:26:26 +0200 Subject: [PATCH 62/94] Update corpus validation documentation --- documentation/Corpus-validation.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/documentation/Corpus-validation.md b/documentation/Corpus-validation.md index 02fe13344..42942bea7 100644 --- a/documentation/Corpus-validation.md +++ b/documentation/Corpus-validation.md @@ -6,15 +6,17 @@ A corpus has two additional checks for different stages of its creation: _ready ## Ready to index -A corpus that does not meet this check cannot be indexed in Elasticsearch. This can mean that it's missing essential fields. +A corpus that does not meet this check cannot be indexed in Elasticsearch. This can mean that it's missing essential fields, has no source data directory configured, etc. A corpus must pass this check when you use the `index` command. ## Ready to publish -A corpus that does not meet this check cannot be searched in the frontend. This usually means the interface configuration is incomplete or invalid. However, a corpus must also be "index-ready". +A corpus that does not meet this check cannot be searched in the frontend. This usually means the interface configuration is incomplete or invalid. -If a corpus does not pass this check, it won't be included in the API for the search interface. +A corpus must pass this check to be set to `active` - which enables the corpus in the search interface. + +The `ready_to_publish` validation is not used directly when handling views, because it can include some non-trivial checks. For Python corpora, `active` is simply set by running `ready_to_publish()` after importing the corpus definition. ## API @@ -23,9 +25,3 @@ These checks are available as methods on a `Corpus` object. 
The corpus has two m In addition, the following methods wrap these validation functions in a try/except block and return a boolean value: `corpus.ready_to_index()` and `corpus.ready_to_publish()`. The first two methods are useful if you want feedback, the latter two methods are useful if you just need a binary state. - -## Activity state - -The `corpus.ready_to_publish()` is not used directly when handling views, because it can include some non-trivial checks. Instead, corpora have a database field `active` which is checked when responding to HTTP requests. - -For Python corpora, `active` is simply set by running `ready_to_publish()` after importing the corpus definition. From c1f40c9d375a4694939ddcb087c167a7d1d9e45a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 14:55:07 +0200 Subject: [PATCH 63/94] improve documentation for JSON definitions --- backend/addcorpus/schemas/corpus.schema.json | 71 ++++++++++--------- .../Writing-a-corpus-definition-in-JSON.md | 14 ++++ 2 files changed, 52 insertions(+), 33 deletions(-) create mode 100644 documentation/Writing-a-corpus-definition-in-JSON.md diff --git a/backend/addcorpus/schemas/corpus.schema.json b/backend/addcorpus/schemas/corpus.schema.json index dda75940c..41505bb27 100644 --- a/backend/addcorpus/schemas/corpus.schema.json +++ b/backend/addcorpus/schemas/corpus.schema.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://github.com/UUDigitalHumanitieslab/I-analyzer/blob/develop/backend/addcorpus/schemas/corpus.schema.json", "title": "Corpus", - "description": "A corpus on I-analyzer", + "description": "Definition of a corpus in I-analyzer", "type": "object", "properties": { "name": { @@ -23,7 +23,7 @@ }, "languages": { "type": "array", - "description": "IETF tags of languages used in the content", + "description": "IETF tags of languages used in the content. 
List as most to least frequent.", "items": { "type": "string" }, @@ -32,7 +32,7 @@ }, "category": { "type": "string", - "description": "nature of the content", + "description": "Nature of the content", "enum": [ "parliament", "periodical", @@ -47,7 +47,7 @@ }, "date_range": { "type": "object", - "description": "the date range of the content", + "description": "The date range of the content", "properties": { "min": { "type": "string", @@ -70,20 +70,20 @@ }, "source_data": { "type": "object", - "description": "information about the source data files", + "description": "Information about the source data files", "properties": { "type": { "type": "string", - "description": "data type of the source files", + "description": "Data type of the source files", "enum": ["csv"] }, "options": { "type": "object", - "description": "additional options for source files", + "description": "Additional options for source files", "properties": { "delimiter": { "type": "string", - "description": "delimiter between values in the source files", + "description": "Delimiter between values in the source files", "enum": [",", ";", "\t"] } } @@ -93,25 +93,26 @@ }, "fields": { "type": "array", - "description": "list of fields", + "description": "List of fields", "items": { "type": "object", "description": "A field in a corpus", "properties": { "name": { "type": "string", - "description": "internal name" + "description": "Internal name" }, "display_name": { "type": "string", - "description": "human-friendly name" + "description": "Human-friendly name" }, "description": { "type": "string", - "description": "longer description for users" + "description": "Longer description for users" }, "type": { "type": "string", + "description": "The type of data", "enum": [ "text_content", "text_metadata", @@ -128,43 +129,43 @@ "properties": { "search": { "type": "boolean", - "description": "whether the field supports full-text search" + "description": "Whether the field supports full-text search" }, "filter": 
{ "type": "string", - "description": "search filter for the field", + "description": "Whether users can filter results based on this field, and if the filter widget is shown by default", "enum": ["show", "hide", "none"] }, "preview": { "type": "boolean", - "description": "whether the field is included in the preview of a document" + "description": "Whether the field is included in the preview of a document" }, "visualize": { "type": "boolean", - "description": "whether the field is visualised" + "description": "Whether the field is visualised" }, "sort": { "type": "boolean", - "description": "whether search results can be sorted on this field" + "description": "Whether search results can be sorted on this field" }, "hidden": { "type": "boolean", - "description": "whether the field is hidden from the interface" + "description": "Whether the field is hidden from the interface" } }, "required": ["search", "filter", "preview", "visualize", "sort", "hidden"] }, "language": { "type": "string", - "description": "language of the field's content. Either an IETF tag, or \"dynamic\"." + "description": "Language of the field's content. Either an IETF tag, or \"dynamic\"." 
}, "extract": { "type": "object", - "description": "how to extract this field's value from source files", + "description": "How to extract this field's value from source files", "properties": { "column": { "type": "string", - "description": "name of the column in CSV source files" + "description": "Name of the column in CSV source files" } }, "required": ["column"] @@ -177,31 +178,35 @@ "type": "object", "properties": { "default_sort": { - "description": "default sort settings for search results", + "description": "Default sort settings for search results", "$ref": "#sortSetting" }, "language_field": { "type": "string", - "description": "name of the field that contains the IETF tag of the document's content" + "description": "Name of the field that contains the IETF tag of each document's content" }, "document_context": { "type": "object", - "description": "description of how documents can be grouped", + "description": "Description of how documents can be grouped", "properties": { - "context_field": { - "type": "string", - "description": "name of the field to group by" + "context_fields": { + "type": "array", + "description": "The fields to group by", + "items": { + "type": "string", + "description": "Name of the field" + } }, "display_name": { "type": "string", - "description": "display name of a group, ,e.g. 'book'" + "description": "Display name of a group, ,e.g. 
'book'" }, "sort": { - "description": "when showing document context, sort them like this", + "description": "How documents within a group should be sorted", "$ref": "#sortSetting" } }, - "required": ["context_field", "display_name"] + "required": ["context_fields", "display_name"] } } } @@ -211,15 +216,15 @@ "sortSetting": { "$anchor": "sortSetting", "type": "object", - "description": "Describes how to sort search results", + "description": "How to sort search results", "properties": { "field": { "type": "string", - "description": "name of on which to sort" + "description": "Name of the field on which to sort" }, "ascending": { "type": "boolean", - "description": "whether the sort direction is ascending or descending" + "description": "Whether the sort direction is ascending or descending" } }, "required": ["field", "ascending"] diff --git a/documentation/Writing-a-corpus-definition-in-JSON.md b/documentation/Writing-a-corpus-definition-in-JSON.md new file mode 100644 index 000000000..9f93a6a1d --- /dev/null +++ b/documentation/Writing-a-corpus-definition-in-JSON.md @@ -0,0 +1,14 @@ +# Writing a corpus definition in JSON + +Database-only corpora support a JSON format for creating corpus definitions. This format is implemented in the backend API of I-analyzer. Like Python definitions, a JSON definition can be used to store and share a configuration for a corpus. + +The format is defined in [corpus.schema.json](/backend/addcorpus/schemas/corpus.schema.json). + +## Importing and exporting definitions + +Currently, importing and exporting JSON definitions is only supported through the backend API. + +Some notes on importing and exporting JSON definitions: + +- A JSON definition is less detailed than the database model. This is because the database model must also support Python corpora (which offer more customisation) and legacy options. If you edit a corpus through the admin, exporting it to JSON and importing it again may include some normalisation. 
+- Some properties of the corpus are not handled through the JSON interface, though they are supported in database-only corpora. Currently, these can only be configured in the admin. These are the corpus image, documentation pages, and data directory. From 51d8b16ad0740cacc0c76756438ddac86825273a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 15:20:44 +0200 Subject: [PATCH 64/94] update first time setup docs --- documentation/First-time-setup.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/documentation/First-time-setup.md b/documentation/First-time-setup.md index d6fd719d3..247215e40 100644 --- a/documentation/First-time-setup.md +++ b/documentation/First-time-setup.md @@ -71,14 +71,31 @@ Note: you can also call the .env file .myenv and specify this during startup: ## Adding corpora -To include corpora on your environment, you need to index them from their source files. The source files are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add it your our environment as follows: +These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in I-analyzer or by other developers. -_Note:_ these instructions are for indexing a corpus that already has a corpus definition. For adding new corpus definitions, see [How to add a new corpus to I-analyzer](./documentation/How-to-add-a-new-corpus-to-Ianalyzer.md). +In a first-time setup, it is recommended that you add at least one existing corpus before creating your own. Documentation on creating new corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md). 
+ +### Python corpora + +Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch. + +The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus to your own environment as follows: 1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora). 2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files. 3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. +### Database-only corpora + +Note: database-only corpora are still in development and not yet recommended for first-time users. + +To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. To retrieve a JSON definition from a running I-analyzer server, visit `/api/corpus/edit/` and copy the JSON of the corpus you want to import. + +1. Start up your I-analyzer server. Make a POST request to `localhost:8000/api/corpus/edit/` (you can use the browsable API for this) to import the JSON definition. +2. Visit the admin menu (`localhost:8000/admin`). 
Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory. +3. Activate your python virtual environment. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. + + ## Running a dev environment 1. Start your local elasticsearch server. If you installed from .zip or .tar.gz, this can be done by running `{path your your elasticsearch folder}/bin/elasticsearch` From e1cd97db3779ebc617c9c3f7e0e9298790e48e2a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 15:53:59 +0200 Subject: [PATCH 65/94] minor documentation updates for database-only corpora --- documentation/Adding-word-models.md | 6 ++++-- documentation/Overview.md | 2 +- documentation/Versioning.md | 9 ++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/documentation/Adding-word-models.md b/documentation/Adding-word-models.md index 8d420c70c..df282502d 100644 --- a/documentation/Adding-word-models.md +++ b/documentation/Adding-word-models.md @@ -1,6 +1,8 @@ # Adding word models -Corpora have the option to include word vectors. I-analyzer visualisations are built for _diachronic_ word models, showing how word meaning changes over time. As such, I-analyzer expects that you trained models for different time intervals. +Corpora have the option to include word vectors. (This option is only supported for Python corpora.) + +I-analyzer visualisations are built for _diachronic_ word models, showing how word meaning changes over time. As such, I-analyzer expects that you trained models for different time intervals. 
## Expected file format Word embeddings are expected to come with the following files: @@ -9,7 +11,7 @@ For each time bin, it expects files of the format - `_{startYear}_{endYear}.wv` (contains gensim KeyedVectors for a model trained on the time bin) ## Documentation -Please include documentation on the method and settings used to train a model. This documentation is expected to be located in `wm/documentation.md`, next to the corpus definition that includes word models. +Please include documentation on the method and settings used to train a model. See the separate documentation on [how to include documentation pages](./Writing-a-corpus-definition-in-Python.md#documentation-files-and-corpus-image) and [how to write documentation pages](./Corpus-documentation.md). ## Including word models diff --git a/documentation/Overview.md b/documentation/Overview.md index f8118bd2b..0c1750b09 100644 --- a/documentation/Overview.md +++ b/documentation/Overview.md @@ -8,7 +8,7 @@ The I-analyzer backend (`/backend`) is a python/Django app that provides the fol - A 'users' module that defines user accounts. -- A 'corpora' module containing corpus definitions and metadata of the currently implemented corpora. For each corpus added in I-analyzer, this module defines how to extract document contents from its source files and sets parameters for displaying the corpus in the interface, such as sorting options. +- A 'corpora' module containing corpus definitions and metadata of all corpora that are defined in Python. (Corpora can also be defined as database objects.) For each Python corpus added in I-analyzer, this module defines how to extract document contents from its source files and sets parameters for displaying the corpus in the interface, such as sorting options. - An 'addcorpus' module which manages the functionality to extract data from corpus source files (given the definition) and save this in an elasticsearch index. 
Source files can be XML or HTML format (which are parsed with `beautifulsoup4` + `lxml`) or CSV. This module also provides the basic data structure for corpora. diff --git a/documentation/Versioning.md b/documentation/Versioning.md index c10f8c1e9..9afbac3bd 100644 --- a/documentation/Versioning.md +++ b/documentation/Versioning.md @@ -29,7 +29,8 @@ A _minor_ release can include: - Layout changes to the frontend that don't remove functionality - Backwards compatible changes to routing in the frontend. A change is backwards compatible if URLs from older versions will still direct to the same content. - New corpus definitions -- Changes to corpus definitions where updating existing definitions is not required, such as new options or source data types +- Changes to the format for JSON corpus definitions where an older definition will continue to function as before. +- Changes to the format for Python corpus definitions, which may require older definitions to be updated. - Changes that require minor updates to the server configuration that are backwards compatible, such as a new Django setting or environment variable. A _major_ release can inclue: @@ -37,7 +38,7 @@ A _major_ release can inclue: - Everything listed for minor releases - Removing functionality in the frontend - Backwards incompatible changes to routing in the frontend: URLs from older versions will no longer direct to the same content -- Backwards incompatible changes to corpus definitions: these require updating existing definitions +- Changes to the format for JSON corpus definitions that require updates to existing definitions. - Changes that require updating the server configuration in a way that is _not_ backwards compatible ## When to make a release @@ -56,4 +57,6 @@ Updating after _patch_ releases is always recommended and should be straightforw On our own servers, the deployment script (`deploy.py`) takes care of all these steps. 
-For _minor_ and _major_ releases, make sure to check the release notes to see if they require changes to the server configuration. Note that for major releases, these changes may require significant work or make your server incompatible with older versions of I-analyzer. +For _minor_ and _major_ releases, make sure to check the release notes to see if they require changes to the server configuration. If you are adding your own Python corpus definitions (that are not in the I-analyzer repository), check for updates in the format. + +Note that for major releases, these changes may require significant work or make your server incompatible with older versions of I-analyzer. From 0444142e3922b6c2d55d671c3d52add1867796ad Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 15 May 2024 16:34:26 +0200 Subject: [PATCH 66/94] rename corpus/edit -> corpus/definitions --- backend/addcorpus/json_corpora/tests/test_import.py | 10 +++++----- backend/addcorpus/serializers.py | 2 +- backend/addcorpus/tests/test_corpus_views.py | 6 +++--- backend/addcorpus/views.py | 6 +++--- backend/conftest.py | 4 ++-- backend/ianalyzer/urls.py | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 1004ff33e..47c98f122 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -1,13 +1,13 @@ from datetime import date from addcorpus.json_corpora.import_json import _parse_field from addcorpus.models import Field, Corpus -from addcorpus.serializers import CorpusEditSerializer +from addcorpus.serializers import CorpusJSONDefinitionSerializer from addcorpus.models import Corpus, CorpusConfiguration def test_json_corpus_import(db, json_corpus_data): Corpus.objects.all().delete() - serializer = CorpusEditSerializer(data=json_corpus_data) + serializer = 
CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) @@ -39,7 +39,7 @@ def test_json_corpus_import(db, json_corpus_data): def test_serializer_representation(db, json_corpus_data): Corpus.objects.all().delete() - serializer = CorpusEditSerializer(data=json_corpus_data) + serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) @@ -50,7 +50,7 @@ def test_serializer_representation(db, json_corpus_data): def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus): # edit description json_corpus_data['meta']['description'] = 'A different description' - serializer = CorpusEditSerializer(data=json_corpus_data) + serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() serializer.update(json_mock_corpus, serializer.validated_data) corpus_config = CorpusConfiguration.objects.get(corpus=json_mock_corpus) @@ -59,7 +59,7 @@ def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus): # remove a field assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 2 json_corpus_data['fields'] = json_corpus_data['fields'][:-1] - serializer = CorpusEditSerializer(data=json_corpus_data) + serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() serializer.update(json_mock_corpus, serializer.validated_data) assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 1 diff --git a/backend/addcorpus/serializers.py b/backend/addcorpus/serializers.py index ca1fae546..d6fb31db9 100644 --- a/backend/addcorpus/serializers.py +++ b/backend/addcorpus/serializers.py @@ -128,7 +128,7 @@ class Meta: fields = ['corpus_configuration', 'type', 'content'] -class CorpusEditSerializer(serializers.ModelSerializer): +class 
CorpusJSONDefinitionSerializer(serializers.ModelSerializer): class Meta: model = Corpus fields = '__all__' diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 913b4e8f4..9e1c3fc99 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -87,17 +87,17 @@ def test_corpus_not_publication_ready(admin_client, basic_mock_corpus): def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mock_corpus: Corpus): json_mock_corpus.delete() - response = admin_client.get('/api/corpus/edit/') + response = admin_client.get('/api/corpus/definitions/') assert status.is_success(response.status_code) assert len(response.data) == 0 response = admin_client.post( - '/api/corpus/edit/', + '/api/corpus/definitions/', json_corpus_data, content_type='application/json', ) assert status.is_success(response.status_code) - response = admin_client.get('/api/corpus/edit/') + response = admin_client.get('/api/corpus/definitions/') assert status.is_success(response.status_code) assert len(response.data) == 1 diff --git a/backend/addcorpus/views.py b/backend/addcorpus/views.py index 5d70939ac..f495fd162 100644 --- a/backend/addcorpus/views.py +++ b/backend/addcorpus/views.py @@ -1,5 +1,5 @@ from rest_framework.views import APIView -from addcorpus.serializers import CorpusSerializer, CorpusDocumentationPageSerializer, CorpusEditSerializer +from addcorpus.serializers import CorpusSerializer, CorpusDocumentationPageSerializer, CorpusJSONDefinitionSerializer from rest_framework.response import Response from addcorpus.python_corpora.load_corpus import corpus_dir, load_corpus_definition import os @@ -88,9 +88,9 @@ def get(self, request, *args, **kwargs): return send_corpus_file(subdir='documents', **kwargs) -class CorpusEditViewset(viewsets.ModelViewSet): +class CorpusDefinitionViewset(viewsets.ModelViewSet): permission_classes = [IsAdminUser] - serializer_class = 
CorpusEditSerializer + serializer_class = CorpusJSONDefinitionSerializer def get_queryset(self): return Corpus.objects.filter(has_python_definition=False) diff --git a/backend/conftest.py b/backend/conftest.py index 602ce741b..a3cf09f33 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -14,7 +14,7 @@ from django.conf import settings from django.contrib.auth.models import Group from addcorpus.models import Corpus -from addcorpus.serializers import CorpusEditSerializer +from addcorpus.serializers import CorpusJSONDefinitionSerializer @pytest.fixture(autouse=True) def media_dir(tmpdir, settings): @@ -203,7 +203,7 @@ def json_corpus_data(): @pytest.fixture(autouse=True) def json_mock_corpus(db, json_corpus_data) -> Corpus: # add json mock corpora to the database at the start of each test - serializer = CorpusEditSerializer(data=json_corpus_data) + serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) return corpus diff --git a/backend/ianalyzer/urls.py b/backend/ianalyzer/urls.py index 3faf7dd5a..67b181ea8 100644 --- a/backend/ianalyzer/urls.py +++ b/backend/ianalyzer/urls.py @@ -33,12 +33,12 @@ from media import urls as media_urls from tag import urls as tag_urls from tag.views import TagViewSet -from addcorpus.views import CorpusEditViewset +from addcorpus.views import CorpusDefinitionViewset api_router = routers.DefaultRouter() # register viewsets with this router api_router.register('search_history', QueryViewset, basename='query') api_router.register('tag/tags', TagViewSet) -api_router.register('corpus/edit', CorpusEditViewset, basename='corpus') +api_router.register('corpus/definitions', CorpusDefinitionViewset, basename='corpus') if settings.PROXY_FRONTEND: spa_url = re_path(r'^(?P.*)$', proxy_frontend) From e1136333092a4ad29699473dde00b00e22316ee8 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 May 2024 11:24:01 +0200 Subject: [PATCH 67/94] work 
in progress: Netherlands citation page --- .../parliament/citation/netherlands.md | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 backend/corpora/parliament/citation/netherlands.md diff --git a/backend/corpora/parliament/citation/netherlands.md b/backend/corpora/parliament/citation/netherlands.md new file mode 100644 index 000000000..dbacdd08a --- /dev/null +++ b/backend/corpora/parliament/citation/netherlands.md @@ -0,0 +1,34 @@ +## Citing the entire corpus + +People & Parliament presents the *Dutch parliamentary data* corpus, which is a combination of the following: +- Dutch parliamentary proceedings from 1814-2013, harvested and enriched in the [Political Mashup project](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-xk5-dw3s), retrieved 2020 +- Dutch parliamentary proceedings from 2014-2022, harvested and enriched by [ParlaMINT](https://www.clarin.eu/parlamint), first retrieved 2020 and updated 2023 + +### APA style + +> University of Jyväskylä and Utrecht University (2020, 2023). *Dutch Parliamentary data* [data set]. People & Parliament: the Netherlands. {{ frontend_url }}/search/parliament-netherlands + +### MLA style + +[MLA guidelines](https://style.mla.org/) recommend against citing a database, and recommend [citing each individual work you use](https://style.mla.org/separate-entries-database-works/). If you want to cite the entire corpus nonetheless, we recommend the following format: + +> University of Jyväskylä and Utrecht University. "Dutch Parliamentary data". *People & Parliament*, 2020/2023. {{ frontend_url }}/search/parliament-netherlands + +## Citing a specific speech + +When you cite a speech in the *Dutch Parliamentary data* corpus, newer documents contain a `Source URL` field. If this field is not present, we recommend that you retrieve a link by clicking the *link* icon underneath the speech's document tile. 
This should give you an url as follows: +https://people-and-parliament.hum.uu.nl/document/parliament-netherlands/ParlaMint-NL_2021-12-21-eerstekamer-4.u1 + +To get an URL for an entire debate, you can use the *view debate* link for a speech. This will get you a link like this: + + {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc + +This describes the query to view all chapters of the book on I-analyzer. + +### APA style + +> Porjeere, O. (1788). *Zanglievende uitspanningen*. Martinus de Bruijn. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc + +### MLA style + +> Porjeere, Olivier. *Zanglievende uitspanningen*. Martinus de Bruijn, 1788. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc From a9d23b09131a8c04f6b78b20c635808961be478c Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 May 2024 11:24:17 +0200 Subject: [PATCH 68/94] add document_context() to Netherlands corpus definition --- backend/corpora/parliament/netherlands.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py index f83c8287e..09a21dc5e 100644 --- a/backend/corpora/parliament/netherlands.py +++ b/backend/corpora/parliament/netherlands.py @@ -11,6 +11,7 @@ from corpora.parliament.utils.parlamint import extract_all_party_data, extract_people_data, extract_role_data, party_attribute_extractor, person_attribute_extractor from corpora.utils.formatting import format_page_numbers from corpora.parliament.parliament import Parliament +from corpora.utils.constants import document_context import corpora.parliament.utils.field_defaults as field_defaults import re @@ -132,11 +133,13 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition): es_index = getattr(settings, 'PP_NL_INDEX', 'parliament-netherlands') image = 'netherlands.jpg' description_page = 'netherlands.md' + citation_page = 'netherlands.md' tag_toplevel = 
lambda _, metadata: 'root' if is_old(metadata) else 'TEI' tag_entry = lambda _, metadata: 'speech' if is_old(metadata) else 'u' languages = ['nl'] category = 'parliament' + document_context = document_context() def sources(self, start, end): logger = logging.getLogger(__name__) From dd88f64101b811157951d6c1c9b6945d038e83dc Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 May 2024 12:00:40 +0200 Subject: [PATCH 69/94] update netherlands.md citation page --- .../corpora/parliament/citation/netherlands.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/backend/corpora/parliament/citation/netherlands.md b/backend/corpora/parliament/citation/netherlands.md index dbacdd08a..8e8a9d34d 100644 --- a/backend/corpora/parliament/citation/netherlands.md +++ b/backend/corpora/parliament/citation/netherlands.md @@ -14,21 +14,20 @@ People & Parliament presents the *Dutch parliamentary data* corpus, which is a c > University of Jyväskylä and Utrecht University. "Dutch Parliamentary data". *People & Parliament*, 2020/2023. {{ frontend_url }}/search/parliament-netherlands -## Citing a specific speech - -When you cite a speech in the *Dutch Parliamentary data* corpus, newer documents contain a `Source URL` field. If this field is not present, we recommend that you retrieve a link by clicking the *link* icon underneath the speech's document tile. This should give you an url as follows: -https://people-and-parliament.hum.uu.nl/document/parliament-netherlands/ParlaMint-NL_2021-12-21-eerstekamer-4.u1 - +## Referring to a debate To get an URL for an entire debate, you can use the *view debate* link for a speech. This will get you a link like this: - {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc + {{ frontend_url }}/search/parliament-netherlands?debate_id=ParlaMint-NL_2021-12-21-eerstekamer-4&sort=sequence,asc + +## Citing a specific speech -This describes the query to view all chapters of the book on I-analyzer. 
+To cite a speech in the *Dutch Parliamentary data* corpus, you can retrieve a link by clicking the *link* icon underneath the speech's document tile. This should give you an url as follows: +{{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-eerstekamer-4.u1 ### APA style -> Porjeere, O. (1788). *Zanglievende uitspanningen*. Martinus de Bruijn. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc +> Rutte, M. (2021). In *Report of the meeting of the Dutch Lower House, Meeting 37, Session 2 (2021-12-21)*. {{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-tweedekamer-2.u225 ### MLA style -> Porjeere, Olivier. *Zanglievende uitspanningen*. Martinus de Bruijn, 1788. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc +> Rutte, Mark. *Report of the meeting of the Dutch Lower House, Meeting 37, Session 2 (2021-12-21)*, 2021. {{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-tweedekamer-2.u225 From ce40ca380366ef5b91e97930cdb7ea1e1b6aea0c Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 May 2024 15:24:56 +0200 Subject: [PATCH 70/94] fix: remove duplicate word_model_path in Canada corpus definition --- backend/corpora/parliament/canada.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/corpora/parliament/canada.py b/backend/corpora/parliament/canada.py index cf16fa3d3..7ca60252c 100644 --- a/backend/corpora/parliament/canada.py +++ b/backend/corpora/parliament/canada.py @@ -26,7 +26,6 @@ class ParliamentCanada(Parliament, CSVCorpusDefinition): required_field = 'content' document_context = document_context(sort_field=None) - word_model_path = getattr(settings, 'PP_CA_WM', None) def sources(self, start, end): logger = logging.getLogger('indexing') From db1a53b131feb4e68cf29a49982c3ca0ce1dda67 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 16 May 2024 16:42:44 +0200 Subject: [PATCH 71/94] review suggestions 
Co-Authored-By: Jelte van Boheemen --- backend/addcorpus/python_corpora/corpus.py | 10 ++++++---- documentation/Corpus-database-models.md | 6 +++--- documentation/Corpus-definitions.md | 6 +++--- documentation/Indexing-corpora.md | 4 +++- documentation/Indexing-on-server.md | 2 +- documentation/Versioning.md | 2 +- documentation/Writing-a-corpus-definition-in-Python.md | 2 +- 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/backend/addcorpus/python_corpora/corpus.py b/backend/addcorpus/python_corpora/corpus.py index a82766ed5..2648e3b99 100644 --- a/backend/addcorpus/python_corpora/corpus.py +++ b/backend/addcorpus/python_corpora/corpus.py @@ -389,15 +389,17 @@ class FieldDefinition(Field): Should be `Filter` instance. extractor: Configuration to extract the field's data from source documents. Should be an `Extractor` instance. - sortable: Whether this field is shown as an option to sort search results. - searchable: Whether this field is shown in the selection for search fields. + sortable: Whether this field is shown as an option to sort search results. If + `None`, the value is inferred from the mapping type. + searchable: Whether this field is shown in the selection for search fields. If + `None`, the value is inferred from the mapping type. downloadable: Whether this field may be included when downloading results. required: Whether this field is required during source extraction. Note that not all Reader subclasses currently support this. 
''' def __init__(self, - name: str = None, + name: str, display_name: Optional[str] = None, display_type: Optional[str] = None, description: str = '', @@ -407,7 +409,7 @@ def __init__(self, csv_core: bool = False, search_field_core: bool = False, visualizations: List[str] = [], - visualization_sort: str = None, + visualization_sort: Optional[str] = None, es_mapping: Dict = {'type': 'text'}, language: Optional[str] = None, search_filter: Optional[Filter] = None, diff --git a/documentation/Corpus-database-models.md b/documentation/Corpus-database-models.md index fecb2b86d..1680c4a70 100644 --- a/documentation/Corpus-database-models.md +++ b/documentation/Corpus-database-models.md @@ -8,7 +8,7 @@ A full corpus definition is represented in four models: - `Corpus` - the main reference point for the corpus - `CorpusConfiguration` - has a one-to-one relationship with `Corpus` and represents all configured metadata -- `Field` - has a one-to-many relationship with `CorpusConfiguration` and represents a field in the corpus. +- `Field` - has a many-to-one relationship with `CorpusConfiguration` and represents a field in the corpus. - `CorpusDocumentationPage` - has a many-to-many relationship with `CorpusConfiguration` and represents documentation for users. These are defined in [/backend/addcorpus/models.py](/backend/addcorpus/models.py). @@ -21,7 +21,7 @@ On the other hand, the `Corpus` contains information about corpus access that is ## Importing Python corpora -Python definitions can be loaded into the database with the `loadcorpora` command in the backend. Normally, this is run when you start the server, so you do not need to run it manually. +Python definitions can be loaded into the database with the `loadcorpora` Django command in the backend. Normally, this is run when you start the server, so you do not need to run it manually. This command will parse any configured python corpora and save a database representation for them. 
If the python corpus cannot be loaded, the `Corpus` object will still exist in the database, but it will be inactive. @@ -31,7 +31,7 @@ If a corpus by the same name already exists in the database, the command will co Corpora have an `active` status that determines whether they are available for searching. In addition, you can configure the `groups` connected to a corpus, which determines who has access to it. A user will see a corpus if it is active and they are in a group that is given access. (A superuser implicitly has access to all active corpora.) -I-analyzer always includes a group named `'basic'`, which everyone is a member of by default, including anonymous users. So if you want a corpus to be public, add this group to it. +I-analyzer always includes a group named `'basic'`, which everyone is a member of by default, including anonymous users. If you want a corpus to be public, add this group to it. While a corpus is inactive, its validation is less strict. This allows you to build a database-only corpus in steps, and save an incomplete definition as a work in progress. See [Corpus validation](/documentation/Corpus-validation.md) for more details. diff --git a/documentation/Corpus-definitions.md b/documentation/Corpus-definitions.md index ec246792b..c4fab4ae8 100644 --- a/documentation/Corpus-definitions.md +++ b/documentation/Corpus-definitions.md @@ -2,7 +2,7 @@ Corpus definitions are the way that we configure each corpus in I-analyzer. -This documents gives a basic explanation of how corpus definitions "work" in the backend. It introduces the core concepts and mechanics. +This document gives a basic explanation of how corpus definitions work in the backend. It introduces the core concepts and mechanics. ## Corpus definitions @@ -17,7 +17,7 @@ Note that a corpus definition does not include the actual data (i.e. documents), Corpora can be created in two ways: -- a **Python corpus** is defined in a Python module. 
Most data from this module is loaded into the databse, but the module also implements custom functions for complex functionality, such as data extraction. +- a **Python corpus** is defined in a Python module. Most data from this module is loaded into the database, but the module also implements custom functions for complex functionality, such as data extraction. - a **database-only corpus** is only represented in the database and does not use any custom Python functions. It offers less customisation, but is easier to create. > [!NOTE] @@ -31,7 +31,7 @@ These are the key differences between Python and database-only corpora. ### Data extraction -A Python corpus can theoretically extract data from any format. In practice, we rely on the [ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/) package which provides extraction utilities for common file types like CSV and XML, but the methods for extraction can as complex as you want. The design philosophy is that you can use the original format of a dataset as the source data for I-analyzer, without any pre-processing. +A Python corpus can theoretically extract data from any format. In practice, we rely on the [ianalyzer_readers](https://ianalyzer-readers.readthedocs.io/en/latest/) package which provides extraction utilities for common file types like CSV and XML, but the methods for extraction can be as complex as you want. The design philosophy is that you can use the original format of a dataset as the source data for I-analyzer, without any pre-processing. A database-only corpus only supports CSV extraction with very little room for customisation. Here, the idea is that you pre-process your data *before* you pass it on to I-analyzer. If it is convenient, you can use the `ianalyzer_readers` package to do so. 
diff --git a/documentation/Indexing-corpora.md b/documentation/Indexing-corpora.md index 56e019b1a..04fb71e33 100644 --- a/documentation/Indexing-corpora.md +++ b/documentation/Indexing-corpora.md @@ -1,6 +1,8 @@ # Indexing corpora -Indexing is the step to read the source data of the corpus and load it into elasticsearch, which makes the data available through the I-analyzer interface. +Indexing is the step to read the source data of the corpus and load it into elasticsearch. Elasticsearch creates an *index* of the data, which makes it available for efficient searching and aggregations. + +This step is necessary to make a dataset available in the I-analyzer interface. Note that indexing can take a significant amount of time (depending on the amount of data). You can start indexing once you have: - Created a definition for the corpus diff --git a/documentation/Indexing-on-server.md b/documentation/Indexing-on-server.md index a0c0b379c..53fb3d92b 100644 --- a/documentation/Indexing-on-server.md +++ b/documentation/Indexing-on-server.md @@ -10,7 +10,7 @@ On the server, move data to a location in the `/its` share. ### Database-only corpora -Add the corpus to the database of the indexing server. You can use the JSON export/import to do this easily. After importing the JSON representation, use the Django admin menu to add the data directory for the corpus (the path to the data on the `/its` share). +Add the corpus to the database of the indexing server. You can use the JSON export/import to do this easily. After importing the JSON representation, use the Django admin menu to specify the data directory for the corpus (the path to the data directory on the server). 
### Python corpora diff --git a/documentation/Versioning.md b/documentation/Versioning.md index 9afbac3bd..bbac4ad79 100644 --- a/documentation/Versioning.md +++ b/documentation/Versioning.md @@ -33,7 +33,7 @@ A _minor_ release can include: - Changes to the format for Python corpus definitions, which may require older definitions to be updated. - Changes that require minor updates to the server configuration that are backwards compatible, such as a new Django setting or environment variable. -A _major_ release can inclue: +A _major_ release can include: - Everything listed for minor releases - Removing functionality in the frontend diff --git a/documentation/Writing-a-corpus-definition-in-Python.md b/documentation/Writing-a-corpus-definition-in-Python.md index 4ff0f3cbc..b3808bcbb 100644 --- a/documentation/Writing-a-corpus-definition-in-Python.md +++ b/documentation/Writing-a-corpus-definition-in-Python.md @@ -16,7 +16,7 @@ The steps of adding a new Python corpus are usually the following: ## Corpus definition -Start by adding a new Python module `corpusname.py` to the `backend/corpora` directory, and include in the `CORPORA` setting of your Django settings. (Use `settings_local.py` to set this for your own development server only.) +Start by adding a new Python module `<corpus-name>.py` to the `backend/corpora` directory, and include it in the `CORPORA` setting of your Django settings. (Use `settings_local.py` to set this for your own development server only.) The actual definition is a class that you define in this module. It should subclass the [`CorpusDefinition` class](/backend/addcorpus/python_corpora/corpus.py). This class includes some default values for attributes and default behaviour. 
From 15d488a67403ec7edcbd1f5d4b7e0d01a8119cbd Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 16 May 2024 17:17:46 +0200 Subject: [PATCH 72/94] code quality --- .../multiple-choice-filter.component.ts | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts index 2db9172ca..43ebbb97e 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts @@ -5,7 +5,7 @@ import * as _ from 'lodash'; import { BaseFilterComponent } from '../base-filter.component'; import { MultipleChoiceFilter, MultipleChoiceFilterOptions } from '../../models'; import { SearchService } from '../../services'; -import { TermsAggregator } from '../../models/aggregation'; +import { TermsAggregator, TermsResult } from '../../models/aggregation'; @Component({ selector: 'ia-multiple-choice-filter', @@ -33,12 +33,17 @@ export class MultipleChoiceFilterComponent extends BaseFilterComponent - this.options = _.sortBy( - result.map(x => ({ label: x.key, value: x.key, doc_count: x.doc_count })), - o => o.label - ) - ).catch(() => this.options = []); + + const parseOption = (item: TermsResult) => ({ + label: item.key, value: item.key, doc_count: item.doc_count + }); + this.searchService.aggregateSearch( + queryModel.corpus, queryModel, aggregator + ).then(result => + this.options = _.sortBy(result.map(parseOption), option => option.label) + ).catch(() => + this.options = [] + ); } } } From f97116e92c3fe108f2ab32e0f9355e077354f6da Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 17 May 2024 12:27:49 +0200 Subject: [PATCH 73/94] validate data dir when indexing db corpus --- .../json_corpora/tests/test_import.py | 1 - backend/addcorpus/models.py | 17 +++++++++++---- 
backend/addcorpus/reader.py | 8 +++++-- backend/addcorpus/tests/test_reader.py | 9 +++++++- backend/addcorpus/validation/indexing.py | 21 +++++++++++++++++++ backend/es/es_index.py | 4 +++- backend/es/management/commands/index.py | 5 +---- 7 files changed, 52 insertions(+), 13 deletions(-) diff --git a/backend/addcorpus/json_corpora/tests/test_import.py b/backend/addcorpus/json_corpora/tests/test_import.py index 47c98f122..b620ea749 100644 --- a/backend/addcorpus/json_corpora/tests/test_import.py +++ b/backend/addcorpus/json_corpora/tests/test_import.py @@ -12,7 +12,6 @@ def test_json_corpus_import(db, json_corpus_data): corpus = serializer.create(serializer.validated_data) assert corpus.name == 'example' - assert corpus.ready_to_index() config = corpus.configuration diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index 01aa33535..ca31d9912 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -11,10 +11,9 @@ validate_source_data_directory, ) from addcorpus.validation.indexing import (validate_essential_fields, - validate_has_configuration, - validate_language_field) + validate_has_configuration, validate_language_field, validate_has_data_directory) from addcorpus.validation.publishing import (validate_default_sort, - validate_ngram_has_date_field) + validate_ngram_has_date_field) from django.contrib import admin from django.contrib.auth.models import Group from django.contrib.postgres.fields import ArrayField @@ -92,6 +91,7 @@ def validate_ready_to_index(self) -> None: config = self.configuration_obj fields = config.fields.all() + validate_has_data_directory(self) validate_essential_fields(fields) validate_language_field(self) @@ -111,12 +111,21 @@ def validate_ready_to_publish(self) -> None: ''' Validation that should be carried out before making the corpus public. + This also includes most checks that are needed to create an index, but not all + (if the index already exists, you do not need source data). 
+ Raises: CorpusNotIndexableError: the corpus is not meeting requirements for indexing. CorpusNotPublishableError: interface options are improperly configured. ''' - self.validate_ready_to_index() + validate_has_configuration(self) + + config = self.configuration_obj + fields = config.fields.all() + + validate_essential_fields(fields) + validate_language_field(self) validate_ngram_has_date_field(self) validate_default_sort(self) diff --git a/backend/addcorpus/reader.py b/backend/addcorpus/reader.py index aaa9f9288..2566f6fa6 100644 --- a/backend/addcorpus/reader.py +++ b/backend/addcorpus/reader.py @@ -1,12 +1,14 @@ import glob -from addcorpus.models import Corpus, Field -from addcorpus.python_corpora.load_corpus import load_corpus_definition from ianalyzer_readers.extract import CSV from ianalyzer_readers.readers.core import Field as ReaderField from ianalyzer_readers.readers.core import Reader from ianalyzer_readers.readers.csv import CSVReader +from addcorpus.models import Corpus, Field +from addcorpus.python_corpora.load_corpus import load_corpus_definition +from addcorpus.validation.indexing import validate_has_data_directory + def make_reader_field(corpus_field: Field) -> ReaderField: return ReaderField( @@ -25,6 +27,8 @@ def make_reader(corpus: Corpus) -> Reader: if corpus.has_python_definition: return load_corpus_definition(corpus.name) + validate_has_data_directory(corpus) + class NewReader(CSVReader): data_directory = corpus.configuration.data_directory delimiter = corpus.configuration.source_data_delimiter diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index 05775ec19..43aefa2d2 100644 --- a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -1,8 +1,10 @@ import os from django.conf import settings +import pytest + from addcorpus.models import Corpus from addcorpus.reader import make_reader - +from addcorpus.validation.indexing import CorpusNotIndexableError def 
test_make_reader_python(basic_mock_corpus): @@ -29,3 +31,8 @@ def test_make_reader_json(json_mock_corpus): 'character': 'HAMLET', 'line': "Whither wilt thou lead me? Speak, I\'ll go no further." } + + +def test_reader_validates_directory(json_mock_corpus): + with pytest.raises(CorpusNotIndexableError): + reader = make_reader(json_mock_corpus) diff --git a/backend/addcorpus/validation/indexing.py b/backend/addcorpus/validation/indexing.py index 4f879ea21..a3763ef7b 100644 --- a/backend/addcorpus/validation/indexing.py +++ b/backend/addcorpus/validation/indexing.py @@ -3,6 +3,7 @@ ''' import warnings +import os from addcorpus.validation.creation import primary_mapping_type @@ -75,3 +76,23 @@ def validate_language_field(corpus): 'Cannot use "dynamic" language for fields without configuring a ' 'field_language for the corpus' ) + +def validate_has_data_directory(corpus): + ''' + If the corpus does not have a Python definition, validate that it has a data + directory. + ''' + + if corpus.has_python_definition: + return + + config = corpus.configuration + if not config.data_directory: + raise CorpusNotIndexableError( + 'Missing data directory' + ) + + if not os.path.isdir(config.data_directory): + raise CorpusNotIndexableError( + 'Configured data directory does not exist.' 
+ ) diff --git a/backend/es/es_index.py b/backend/es/es_index.py index d7d6abbb9..23947c377 100644 --- a/backend/es/es_index.py +++ b/backend/es/es_index.py @@ -149,8 +149,10 @@ def perform_indexing( add: bool = False, clear: bool = False, prod: bool = False, - rollover: bool = False + rollover: bool = False, ): + corpus.validate_ready_to_index() + corpus_config = corpus.configuration corpus_name = corpus.name index_name = corpus_config.es_index diff --git a/backend/es/management/commands/index.py b/backend/es/management/commands/index.py index 36a965638..3644bfb05 100644 --- a/backend/es/management/commands/index.py +++ b/backend/es/management/commands/index.py @@ -74,7 +74,7 @@ def add_arguments(self, parser): def handle(self, corpus, start=None, end=None, add=False, delete=False, update=False, mappings_only=False, prod=False, rollover=False, **options): corpus_object = self._corpus_object(corpus) - self._validate(corpus_object) + corpus_object.validate_ready_to_index() corpus_definition = load_corpus_definition(corpus) @@ -123,6 +123,3 @@ def handle(self, corpus, start=None, end=None, add=False, delete=False, update=F def _corpus_object(self, corpus_name): load_all_corpus_definitions() return Corpus.objects.get(name=corpus_name) - - def _validate(self, corpus_obj): - corpus_obj.validate_ready_to_index() From c5d7f89f37aa39d51bc45d5726fadf96170abebe Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 17 May 2024 12:37:38 +0200 Subject: [PATCH 74/94] add data directory to json mock corpus --- backend/conftest.py | 13 +++++++++++-- backend/es/tests/test_es_index.py | 6 +----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/backend/conftest.py b/backend/conftest.py index 02d89ae19..2e76c7939 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -155,8 +155,7 @@ def _index_test_corpus(es_client: Elasticsearch, corpus_name: str): corpus = Corpus.objects.get(name=corpus_name) if not 
es_client.indices.exists(index=corpus.configuration.es_index): - index.create(es_client, corpus, clear=True) - index.populate(es_client, corpus) + index.perform_indexing(corpus) # ES is "near real time", so give it a second before we start searching the index sleep(2) @@ -185,6 +184,11 @@ def index_tag_mock_corpus(db, es_client: Elasticsearch, tag_mock_corpus: str, te _index_test_corpus(es_client, tag_mock_corpus) +@pytest.fixture() +def index_json_mock_corpus(db, es_client: Elasticsearch, json_mock_corpus: Corpus, test_index_cleanup): + _index_test_corpus(es_client, json_mock_corpus.name) + + # mock corpora @pytest.fixture(autouse=True) def add_mock_python_corpora_to_db(db, media_dir): @@ -205,4 +209,9 @@ def json_mock_corpus(db, json_corpus_data) -> Corpus: serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data) assert serializer.is_valid() corpus = serializer.create(serializer.validated_data) + + data_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'source_data') + corpus.configuration.data_directory = data_dir + corpus.configuration.save() + return corpus diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 970cd91a3..71ff4585f 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -63,10 +63,6 @@ def test_mismatch_corpus_index_names(mock_corpus, corpus_definition, es_index_cl assert corpus_definition.es_index != mock_corpus -def test_db_only_corpus(json_mock_corpus, es_client, test_index_cleanup): - perform_indexing( - corpus=json_mock_corpus, - ) - sleep(2) +def test_db_only_corpus(json_mock_corpus, es_client, index_json_mock_corpus): res = es_client.count(index=json_mock_corpus.configuration.es_index) assert res.get('count') == 10 From bb08f9b044a18bd6f6564179568183036a139ff0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 20 May 2024 12:07:56 +0200 Subject: [PATCH 75/94] fix reader validation check --- backend/addcorpus/tests/test_reader.py | 8 
+++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backend/addcorpus/tests/test_reader.py b/backend/addcorpus/tests/test_reader.py index 43aefa2d2..36a9e6ab7 100644 --- a/backend/addcorpus/tests/test_reader.py +++ b/backend/addcorpus/tests/test_reader.py @@ -33,6 +33,12 @@ def test_make_reader_json(json_mock_corpus): } -def test_reader_validates_directory(json_mock_corpus): +def test_reader_validates_directory(json_mock_corpus: Corpus): + # should run without error + make_reader(json_mock_corpus) + + json_mock_corpus.configuration.data_directory = '' + json_mock_corpus.configuration.save() + with pytest.raises(CorpusNotIndexableError): reader = make_reader(json_mock_corpus) From e4f5aa2d49a361f537187c374548b5987063afae Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 20 May 2024 12:25:06 +0200 Subject: [PATCH 76/94] enable sorting by date in guardianobserver --- backend/corpora/guardianobserver/guardianobserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index c3bea0649..54737e274 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -86,7 +86,8 @@ def sources(self, start=datetime.min, end=datetime.max): extractor=extract.XML( tag='NumericPubDate', toplevel=True, transform=lambda x: '{y}-{m}-{d}'.format(y=x[:4],m=x[4:6],d=x[6:]) - ) + ), + sortable=True, ), FieldDefinition( name='date-pub', From 4880b9a9c8e41df13f4cdb6d297944c456b9424e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 21 May 2024 21:34:07 +0200 Subject: [PATCH 77/94] fix markdown syntax --- documentation/Writing-a-corpus-definition-in-Python.md | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/Writing-a-corpus-definition-in-Python.md b/documentation/Writing-a-corpus-definition-in-Python.md index b3808bcbb..6e3b1984f 100644 --- 
a/documentation/Writing-a-corpus-definition-in-Python.md +++ b/documentation/Writing-a-corpus-definition-in-Python.md @@ -60,6 +60,7 @@ The corpus class must define a method `sources(self, **kwargs)`. See the [API do ### Optional attributes | Attribute | Type | Description | +|-----------|------|-------------| | `image` | `str` | The filename of the image used for the corpus in the interface. (See below.) | | `es_alias` | `str` | An alias that you want to assign to the index in elasticsearch. | | `es_settings` | `Dict` | Customises the settings of the elasticsearch index. Can be generated using [es_settings.py](../backend/addcorpus/es_settings.py) | From e8c652c772a7b6b0467e4f91c68c9803e2566227 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 22 May 2024 15:41:51 +0200 Subject: [PATCH 78/94] add Chicago reference recommendation --- backend/corpora/dbnl/citation/citation.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/corpora/dbnl/citation/citation.md b/backend/corpora/dbnl/citation/citation.md index 47723dfd1..3207affd2 100644 --- a/backend/corpora/dbnl/citation/citation.md +++ b/backend/corpora/dbnl/citation/citation.md @@ -27,6 +27,10 @@ To get an URL for an entire book, you can use the *view book* link for a chapter This describes the query to view all chapters of the book on I-analyzer. +### Chicago style + +Mark Rutte, Dutch Lower House, 21 December 2021, page?, link. + ### APA style > Porjeere, O. (1788). *Zanglievende uitspanningen*. Martinus de Bruijn. 
{{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc From 12794814464773dfaff7c5f93fb6ee2844039462 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 23 May 2024 09:32:34 +0200 Subject: [PATCH 79/94] reorganize and syntax check actions --- .github/workflows/backend-test.yml | 26 +++++++++++++++++++++++ .github/workflows/frontend-test.yml | 26 +++++++++++++++++++++++ .github/workflows/release.yaml | 22 -------------------- .github/workflows/release.yml | 25 ++++++++++++++++++++++ .github/workflows/test.yml | 32 ----------------------------- 5 files changed, 77 insertions(+), 54 deletions(-) create mode 100644 .github/workflows/backend-test.yml create mode 100644 .github/workflows/frontend-test.yml delete mode 100644 .github/workflows/release.yaml create mode 100644 .github/workflows/release.yml delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml new file mode 100644 index 000000000..b787ff68c --- /dev/null +++ b/.github/workflows/backend-test.yml @@ -0,0 +1,26 @@ +# This workflow will run backend tests on the Python version defined in the Dockerfiles + +name: Backend unit tests + +on: + workflow_dispatch: + push: + branches: + - 'develop' + - 'master' + - 'feature/**' + - 'bugfix/**' + - 'hotfix/**' + - 'release/**' + - 'dependabot/**' + paths: + - 'backend/**' + +jobs: + backend-test: + name: Test Backend + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run backend tests + run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml new file mode 100644 index 000000000..f2fd8fed2 --- /dev/null +++ b/.github/workflows/frontend-test.yml @@ -0,0 +1,26 @@ +# This workflow will run frontend tests on the Node version defined in the Dockerfiles + +name: Frontend unit tests + +on: + workflow_dispatch: + push: + branches: + - 
'develop' + - 'master' + - 'feature/**' + - 'bugfix/**' + - 'hotfix/**' + - 'release/**' + - 'dependabot/**' + paths: + - 'frontend/**' + +jobs: + frontend-test: + name: Test Frontend + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run frontend tests + run: sudo docker-compose --env-file .env-ci run frontend yarn test diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index 1bd79f24e..000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Release - -on: - workflow_dispatch: - push: - branches: - - 'release/**' - - 'hotfix/**' - jobs: - citation-update: - uses: actions/checkout@v3 - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Update CITATION.cff - - run: | - version=`grep -o '\d\+\.\d\+\.\d\+' package.json` - today=`date +"%Y-%m-%d"` - sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff - sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff - git commit -a -m "update version and date in CITATION.cff" - diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..ed2bc732e --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,25 @@ +# This action will update the CITATION.cff file for new release or hotfix branches + +name: Release + +on: + push: + branches: + - 'release/**' + - 'hotfix/**' + +jobs: + citation-update: + name: Update CITATION.cff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Autoformat CITATION.cff + run: | + version=`grep -o '\d\+\.\d\+\.\d\+' package.json` + today=`date +"%Y-%m-%d"` + sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff + sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff + bash ./update-citation.sh + git commit -a -m "update version and date 
in CITATION.cff" + diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index e465ce012..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,32 +0,0 @@ -# This workflow will run tests on the Python and Node versions defined in the Dockerfiles -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Unit tests - -on: - workflow_dispatch: - push: - branches: - - 'develop' - - 'master' - - 'feature/**' - - 'bugfix/**' - - 'hotfix/**' - - 'release/**' - - 'dependabot/**' - -jobs: - backend-test: - name: Test Backend - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Run backend tests - run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest - frontend-test: - name: Test Frontend - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Run frontend tests - run: sudo docker-compose --env-file .env-ci run frontend yarn test From 011b0ab50d00407b8431472056201cf2f689d787 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 23 May 2024 10:34:38 +0200 Subject: [PATCH 80/94] add annotated-text-plugin to Docker --- DockerfileElastic | 3 +++ docker-compose.yaml | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 DockerfileElastic diff --git a/DockerfileElastic b/DockerfileElastic new file mode 100644 index 000000000..2b908c5fb --- /dev/null +++ b/DockerfileElastic @@ -0,0 +1,3 @@ +FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2 + +RUN bin/elasticsearch-plugin install mapper-annotated-text diff --git a/docker-compose.yaml b/docker-compose.yaml index 9277df751..f3f4d63b3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -52,7 +52,9 @@ services: target: /frontend/build command: sh -c "yarn prebuild && yarn start-docker" elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.10.2 + build: + context: . 
+ dockerfile: DockerfileElastic environment: - node.name=ianalyzer-node - discovery.type=single-node From feb3c8530bc609d79774c08c12760ed8ac3a67a8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 28 May 2024 11:52:07 +0200 Subject: [PATCH 81/94] clean up corpus model remove obsolete properties change properties to camel case --- .../corpus-header/corpus-header.component.ts | 2 +- .../document-view.component.spec.ts | 2 +- .../document-view/document-view.component.ts | 2 +- .../app/image-view/image-view.component.ts | 4 ++-- frontend/src/app/models/corpus.ts | 19 +++++---------- frontend/src/app/models/query.spec.ts | 7 +++--- .../src/app/services/corpus.service.spec.ts | 2 +- frontend/src/app/services/corpus.service.ts | 3 --- frontend/src/app/utils/es-query.ts | 2 +- frontend/src/mock-data/corpus.ts | 24 +++++++------------ 10 files changed, 25 insertions(+), 42 deletions(-) diff --git a/frontend/src/app/corpus-header/corpus-header.component.ts b/frontend/src/app/corpus-header/corpus-header.component.ts index 95baaaac7..9abffd5fc 100644 --- a/frontend/src/app/corpus-header/corpus-header.component.ts +++ b/frontend/src/app/corpus-header/corpus-header.component.ts @@ -24,7 +24,7 @@ export class CorpusHeaderComponent implements OnChanges, OnInit { ngOnChanges(changes: SimpleChanges): void { if (this.corpus) { - this.wordModelsPresent = this.corpus.word_models_present; + this.wordModelsPresent = this.corpus.wordModelsPresent; } } } diff --git a/frontend/src/app/document-view/document-view.component.spec.ts b/frontend/src/app/document-view/document-view.component.spec.ts index 6fa470a3a..f59858f51 100644 --- a/frontend/src/app/document-view/document-view.component.spec.ts +++ b/frontend/src/app/document-view/document-view.component.spec.ts @@ -20,7 +20,7 @@ describe('DocumentViewComponent', () => { fixture = TestBed.createComponent(DocumentViewComponent); component = fixture.componentInstance; component.corpus = _.merge({ - scan_image_type: 
'farout_image_type', + scanImageType: 'farout_image_type', fields: [mockField] }, mockCorpus); component.document = makeDocument({ great_field: 'Hello world!' }); diff --git a/frontend/src/app/document-view/document-view.component.ts b/frontend/src/app/document-view/document-view.component.ts index ee8c453e8..81a35aa27 100644 --- a/frontend/src/app/document-view/document-view.component.ts +++ b/frontend/src/app/document-view/document-view.component.ts @@ -46,7 +46,7 @@ export class DocumentViewComponent implements OnChanges { } get showScanTab() { - return !!this.corpus.scan_image_type; + return !!this.corpus.scanImageType; } ngOnChanges(changes: SimpleChanges): void { diff --git a/frontend/src/app/image-view/image-view.component.ts b/frontend/src/app/image-view/image-view.component.ts index bf93934d2..cc3dd32c1 100644 --- a/frontend/src/app/image-view/image-view.component.ts +++ b/frontend/src/app/image-view/image-view.component.ts @@ -43,8 +43,8 @@ export class ImageViewComponent implements OnChanges { ngOnChanges(changes: SimpleChanges) { if (changes.corpus) { - this.allowDownload = this.corpus.allow_image_download; - this.mediaType = this.corpus.scan_image_type; + this.allowDownload = this.corpus.allowImageDownload; + this.mediaType = this.corpus.scanImageType; } if ( changes.document && diff --git a/frontend/src/app/models/corpus.ts b/frontend/src/app/models/corpus.ts index 04d71cae1..4906e2966 100644 --- a/frontend/src/app/models/corpus.ts +++ b/frontend/src/app/models/corpus.ts @@ -6,9 +6,8 @@ import { Store } from '../store/types'; import { SimpleStore } from '../store/simple-store'; // corresponds to the corpus definition on the backend. -export class Corpus implements ElasticSearchIndex { +export class Corpus { constructor( - public serverName, /** * Internal name for referring to this corpus e.g. in URLs. 
*/ @@ -25,15 +24,13 @@ export class Corpus implements ElasticSearchIndex { public fields: CorpusField[], public minDate: Date, public maxDate: Date, - public scan_image_type: string, - public allow_image_download: boolean, - public word_models_present: boolean, + public scanImageType: string, + public allowImageDownload: boolean, + public wordModelsPresent: boolean, public languages: string[], public category: string, - public descriptionpage?: string, - public citationPage?: string, public documentContext?: DocumentContext, - public new_highlight?: boolean, + public newHighlight?: boolean, public defaultSort?: SortState, public languageField?: CorpusField, ) { } @@ -47,14 +44,10 @@ export class Corpus implements ElasticSearchIndex { } get displayLanguages(): string { - return this.languages.join(', '); // may have to truncate long lists? + return this.languages.join(', '); } } -export interface ElasticSearchIndex { - index: string; - serverName: string; -} export interface DocumentContext { contextFields: CorpusField[]; diff --git a/frontend/src/app/models/query.spec.ts b/frontend/src/app/models/query.spec.ts index 14b95ae60..9fb6ebd61 100644 --- a/frontend/src/app/models/query.spec.ts +++ b/frontend/src/app/models/query.spec.ts @@ -9,16 +9,15 @@ import { SimpleStore } from '../store/simple-store'; const corpus: Corpus = { name: 'mock-corpus', title: 'Mock Corpus', - serverName: 'default', description: '', index: 'mock-corpus', minDate: new Date('1800-01-01'), minYear: 1800, maxDate: new Date('1900-01-01'), maxYear: 1900, - scan_image_type: null, - allow_image_download: true, - word_models_present: false, + scanImageType: null, + allowImageDownload: true, + wordModelsPresent: false, fields: [ mockField2, mockFieldDate, diff --git a/frontend/src/app/services/corpus.service.spec.ts b/frontend/src/app/services/corpus.service.spec.ts index 6a2946cf4..bf07f0d26 100644 --- a/frontend/src/app/services/corpus.service.spec.ts +++ 
b/frontend/src/app/services/corpus.service.spec.ts @@ -200,7 +200,7 @@ describe('CorpusService', () => { expect(items.length).toBe(1); const corpus = _.first(items); - expect(corpus.scan_image_type).toBe('png'); + expect(corpus.scanImageType).toBe('png'); const fieldData = [ { diff --git a/frontend/src/app/services/corpus.service.ts b/frontend/src/app/services/corpus.service.ts index 47f7f4cf4..7092bd591 100644 --- a/frontend/src/app/services/corpus.service.ts +++ b/frontend/src/app/services/corpus.service.ts @@ -77,7 +77,6 @@ export class CorpusService { private parseCorpusItem = (data: any): Corpus => { const allFields: CorpusField[] = data.fields.map(this.parseField); return new Corpus( - data.server_name, data.name, data.title, data.description, @@ -90,8 +89,6 @@ export class CorpusService { data.word_models_present, data.languages, data.category, - data.description_page, - data.citation_page, this.parseDocumentContext(data.document_context, allFields), data.new_highlight, this.parseDefaultSort(data.default_sort, allFields), diff --git a/frontend/src/app/utils/es-query.ts b/frontend/src/app/utils/es-query.ts index e72208bc8..5368470bb 100644 --- a/frontend/src/app/utils/es-query.ts +++ b/frontend/src/app/utils/es-query.ts @@ -97,7 +97,7 @@ export const makeHighlightSpecification = (corpus: Corpus, queryText?: string, h field.positionsOffsets && // add matched_fields for stemmed highlighting // ({ [field.name]: {"type": "fvh", "matched_fields": ["speech", "speech.stemmed"] }}): - corpus.new_highlight + corpus.newHighlight ? 
{ [field.name]: { type: 'fvh', diff --git a/frontend/src/mock-data/corpus.ts b/frontend/src/mock-data/corpus.ts index 0aaa53230..f1472315d 100644 --- a/frontend/src/mock-data/corpus.ts +++ b/frontend/src/mock-data/corpus.ts @@ -137,16 +137,14 @@ export const mockFieldDate = new CorpusField({ export const mockCorpus: Corpus = { name: 'test1', - serverName: 'default', index: 'test1', title: 'Test corpus', description: 'This corpus is for mocking', minDate: new Date('1800-01-01'), maxDate: new Date('1900-01-01'), - image: 'test.jpg', - scan_image_type: 'pdf', - allow_image_download: false, - word_models_present: false, + scanImageType: 'pdf', + allowImageDownload: false, + wordModelsPresent: false, directDownloadLimit: 500, fields: [mockField, mockField2], languages: ['English'], @@ -157,16 +155,14 @@ export const mockCorpus: Corpus = { export const mockCorpus2 = { name: 'test2', - serverName: 'default', index: 'test2', title: 'Test corpus 2', description: 'This corpus is for mocking', minDate: new Date('1850-01-01'), maxDate: new Date('2000-01-01'), - image: 'test.jpg', - scan_image_type: 'pdf', - allow_image_download: false, - word_models_present: false, + scanImageType: 'pdf', + allowImageDownload: false, + wordModelsPresent: false, directDownloadLimit: 1000, fields: [mockField2], languages: ['English', 'French'], @@ -177,16 +173,14 @@ export const mockCorpus2 = { export const mockCorpus3: Corpus = { name: 'test3', - serverName: 'default', index: 'test3', title: 'Test corpus 3', description: 'This corpus is for mocking', minDate: new Date(), maxDate: new Date(), - image: 'test.jpg', - scan_image_type: 'pdf', - allow_image_download: false, - word_models_present: false, + scanImageType: 'pdf', + allowImageDownload: false, + wordModelsPresent: false, directDownloadLimit: 2000, fields: [mockField, mockField2, mockField3, mockFieldDate, mockFieldMultipleChoice], languages: ['English'], From b15c100747ae8fbdd0b85b3ab93e5cde9a3b8f68 Mon Sep 17 00:00:00 2001 From: Luka van 
der Plas Date: Tue, 28 May 2024 17:35:07 +0200 Subject: [PATCH 82/94] Update README.md Remove account requirement in usage instructions. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c4259bf85..8fd6080d8 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ For corpora included in I-analyzer, the backend includes a definition file that ## Usage -If you are interested in using I-analyzer, the most straightforward way to get started is to make an account at [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament (not publicly accessible)](https://people-and-parliament.hum.uu.nl/). +If you are interested in using I-analyzer, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/). I-analyzer does not have an "upload data" option (yet!). 
If you are interested in using I-analyzer as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways: From afd8a48e2df61261902685319da0cd0382c54166 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 14:58:02 +0200 Subject: [PATCH 83/94] add Chicago citation style --- backend/corpora/dbnl/citation/citation.md | 15 +++++++++++---- .../corpora/parliament/citation/netherlands.md | 15 +++++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/backend/corpora/dbnl/citation/citation.md b/backend/corpora/dbnl/citation/citation.md index 3207affd2..f8e947d3a 100644 --- a/backend/corpora/dbnl/citation/citation.md +++ b/backend/corpora/dbnl/citation/citation.md @@ -12,6 +12,9 @@ I-analyzer presents the [DBNL-dataset](https://www.kb.nl/onderzoeken-vinden/data > KB, Nationale Biliotheek. "DBNL-dataset". *I-analyzer*, 2023, {{ frontend_url }}/search/dbnl +### Chicago "notes and bibliography" style +> KB, Nationale Bibliotheek, "DBNL-dataset", distributed by I-analyzer, 2023. {{ frontend_url }}/search/dbnl. + ## Citing a specific work @@ -27,10 +30,6 @@ To get an URL for an entire book, you can use the *view book* link for a chapter This describes the query to view all chapters of the book on I-analyzer. -### Chicago style - -Mark Rutte, Dutch Lower House, 21 December 2021, page?, link. - ### APA style > Porjeere, O. (1788). *Zanglievende uitspanningen*. Martinus de Bruijn. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc @@ -38,3 +37,11 @@ Mark Rutte, Dutch Lower House, 21 December 2021, page?, link. ### MLA style > Porjeere, Olivier. *Zanglievende uitspanningen*. Martinus de Bruijn, 1788. 
{{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc + +### Chicago "notes and bibliography" style +#### First note +> Olivier Porjeere, *Zanglievende uitspanningen* (Alkmaar: Martinus de Bruijn, 1788) {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc. +#### Shortened note +> Porjeere, *Zanglievende uitspanningen* +#### Bibliography entry +> Porjeere, Olivier. *Zanglievende uitspanningen*. Alkmaar: Martinus de Bruijn, 1788. {{ frontend_url }}/search/dbnl?title_id=porj001zang01_01&sort=chapter_index,asc. \ No newline at end of file diff --git a/backend/corpora/parliament/citation/netherlands.md b/backend/corpora/parliament/citation/netherlands.md index 8e8a9d34d..d85763c42 100644 --- a/backend/corpora/parliament/citation/netherlands.md +++ b/backend/corpora/parliament/citation/netherlands.md @@ -4,15 +4,18 @@ People & Parliament presents the *Dutch parliamentary data* corpus, which is a c - Dutch parliamentary proceedings from 1814-2013, harvested and enriched in the [Political Mashup project](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-xk5-dw3s), retrieved 2020 - Dutch parliamentary proceedings from 2014-2022, harvested and enriched by [ParlaMINT](https://www.clarin.eu/parlamint), first retrieved 2020 and updated 2023 +### Chicago "notes and bibliography" style +> University of Jyväskylä and Utrecht University, "Dutch Parliamentary data", distributed by People & Parliament, 2023. {{ frontend_url }}/search/parliament-netherlands. + ### APA style -> University of Jyväskylä and Utrecht University (2020, 2023). *Dutch Parliamentary data* [data set]. People & Parliament: the Netherlands. {{ frontend_url }}/search/parliament-netherlands +> University of Jyväskylä and Utrecht University (2023). *Dutch Parliamentary data* [data set]. People & Parliament. 
{{ frontend_url }}/search/parliament-netherlands ### MLA style [MLA guidelines](https://style.mla.org/) recommend against citing a database, and recommend [citing each individual work you use](https://style.mla.org/separate-entries-database-works/). If you want to cite the entire corpus nonetheless, we recommend the following format: -> University of Jyväskylä and Utrecht University. "Dutch Parliamentary data". *People & Parliament*, 2020/2023. {{ frontend_url }}/search/parliament-netherlands +> University of Jyväskylä and Utrecht University. "Dutch Parliamentary data". People & Parliament, 2023. {{ frontend_url }}/search/parliament-netherlands ## Referring to a debate To get an URL for an entire debate, you can use the *view debate* link for a speech. This will get you a link like this: @@ -24,6 +27,14 @@ To get an URL for an entire debate, you can use the *view debate* link for a spe To cite a speech in the *Dutch Parliamentary data* corpus, you can retrieve a link by clicking the *link* icon underneath the speech's document tile. This should give you an url as follows: {{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-eerstekamer-4.u1 +### Chicago "notes and bibliography" style +#### First note +> Mark Rutte in *Report of the meeting of the Dutch Lower House, Meeting 37, Session 2 (2021-12-21)*, 2021. {{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-tweedekamer-2.u225. +#### Shortened note +> Rutte, *Meeting 37, Session 2 (2021-12-21)* +#### Bibliography entry +> Rutte, Mark. In *Report of the meeting of the Dutch Lower House, Meeting 37, Session 2 (2021-12-21)*, 2021. {{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-tweedekamer-2.u225. + ### APA style > Rutte, M. (2021). In *Report of the meeting of the Dutch Lower House, Meeting 37, Session 2 (2021-12-21)*. 
{{ frontend_url }}/document/parliament-netherlands/ParlaMint-NL_2021-12-21-tweedekamer-2.u225 From c2d4585f339fd1101ba7f5b8f646e12a93850511 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:01:00 +0200 Subject: [PATCH 84/94] update Netherlands info --- backend/corpora/parliament/description/netherlands.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/parliament/description/netherlands.md b/backend/corpora/parliament/description/netherlands.md index 696670a8b..2d4ce45fa 100644 --- a/backend/corpora/parliament/description/netherlands.md +++ b/backend/corpora/parliament/description/netherlands.md @@ -1 +1 @@ -The debates of the First and Second Chamber of the bicameral parliament, enriched until the early 2010s by Maarten Marx for the Political Mashup project, and 2014-2020 by ParlaMINT. Metadata is provided. +The debates of the First and Second Chamber of the bicameral parliament, enriched until the early 2010s by Maarten Marx for the Political Mashup project, and 2014-2023 by ParlaMINT. Metadata is provided. From 8d6795809c138cf74cfa81781c6c468cbf17534d Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:12:34 +0200 Subject: [PATCH 85/94] update application disclaimer --- .../src/assets/about/en-GB/people-and-parliament.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/frontend/src/assets/about/en-GB/people-and-parliament.md b/frontend/src/assets/about/en-GB/people-and-parliament.md index 70296366c..7ab210e3e 100644 --- a/frontend/src/assets/about/en-GB/people-and-parliament.md +++ b/frontend/src/assets/about/en-GB/people-and-parliament.md @@ -1,5 +1,11 @@ -People & Parliament is a prototype of a comparative interface of digitised plenary parliamentary debates from several Northwest European countries. 
It has been constructed by the Academy of Finland Professor (AP) Project Political Representation: Tensions between Parliament and the People from the Age of Revolutions to the 21st Century (2021-2026, see [project decription](https://www.jyu.fi/hytk/fi/laitokset/hela/en/research/political-representation)) at the University of Jyväskylä, Finland, in cooperation with the Digital Humanities Lab of Utrecht University, The Netherlands. The construction has been based a dialogue between state-of-the-art text-mining techniques of the early 2020s and research in political history with a particular emphasis on comparative conceptual history. +*People & Parliament* is a comparative interface of digitised plenary parliamentary debates from several Northwest European countries. It has been constructed by the Academy of Finland Professor (AP) Project Political Representation: Tensions between Parliament and the People from the Age of Revolutions to the 21st Century (2021-2026, see [project decription](https://www.jyu.fi/hytk/fi/laitokset/hela/en/research/political-representation)) at the University of Jyväskylä, Finland, in cooperation with the Digital Humanities Lab of Utrecht University, The Netherlands. The construction has been based a dialogue between state-of-the-art text-mining techniques of the early 2020s and research in political history with a particular emphasis on comparative conceptual history. Data in the interface has been collected from a number of national sources and its quality varies from datasets in almost immediately research-ready state to ones that need extensive processing. Remaining quality problems are due to OCR errors, differing data structures and the availability of metadata. The AP project has not been able to solve all these issues on behalf of national institutions. It has not been possible to apply a unified data model for all the various national datasets, and each has hence been handled individually. 
Our recommendation is that the different corpora are primarily analysed in their national contexts and relative term frequencies from different countries, for instance, are not directly compared. -Currently People & Parliament contains plenary parliamentary debates from the following countries: Canada, Denmark, France, Germany, the Netherlands, Norway, Sweden and the United Kingdom. We are grateful for research partners that have made parts of the data available to us in an enriched form. +Currently *People & Parliament* contains plenary parliamentary debates from the following countries: Canada, Denmark, Finland, France, Germany, Ireland, the Netherlands, Norway, Sweden and the United Kingdom. We are grateful for research partners that have made parts of the data available to us in an enriched form. + +Please cite this interface as follows: +> Ihalainen, Pasi; Vaara, Ville; Bonin, Hugo; Turunen, Risto; Janssen, Berit; Marjanen, Jani; Van der Plas, Luka; Van Stiphout, Mees: *People & Parliament: A Comparative Interface on Parliamentary Debates in Northwest Europe since the Nineteenth Century* + +Contributors: +Jussi Kurunmäki and Zachris Haaparinne \ No newline at end of file From 3c9e0f750464f26a4bb288b02d6339ec9f9661b1 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:18:18 +0200 Subject: [PATCH 86/94] fix: use paths-ignore --- .github/workflows/backend-test.yml | 5 +++-- .github/workflows/frontend-test.yml | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml index b787ff68c..2b87893d4 100644 --- a/.github/workflows/backend-test.yml +++ b/.github/workflows/backend-test.yml @@ -13,8 +13,9 @@ on: - 'hotfix/**' - 'release/**' - 'dependabot/**' - paths: - - 'backend/**' + paths-ignore: + - 'frontend/**' + - '**.md' jobs: backend-test: diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml index f2fd8fed2..bcbea4be5 100644 --- 
a/.github/workflows/frontend-test.yml +++ b/.github/workflows/frontend-test.yml @@ -13,8 +13,9 @@ on: - 'hotfix/**' - 'release/**' - 'dependabot/**' - paths: + paths-ignore: - 'frontend/**' + - '**.md' jobs: frontend-test: From 49840ea7f30d273bb5f4b777fc5b78d0f1c2a999 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:24:54 +0200 Subject: [PATCH 87/94] fix: correct paths-ignore in frontend test --- .github/workflows/frontend-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml index bcbea4be5..fdb14f20e 100644 --- a/.github/workflows/frontend-test.yml +++ b/.github/workflows/frontend-test.yml @@ -14,7 +14,7 @@ on: - 'release/**' - 'dependabot/**' paths-ignore: - - 'frontend/**' + - 'backend/**' - '**.md' jobs: From 22ca665c0e7a092ac0fc0c576ec6aa3fb1c36bf6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:25:12 +0200 Subject: [PATCH 88/94] revertme: check if README triggers test actions --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c4259bf85..b147b8d8b 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ I-analyzer is a web application for exploring corpora (large collections of text I-analyzer is primarily intended for academic research and higher education. We focus on data that is relevant for the humanities, but we are open to datasets that are relevant for other fields. +Editing this README to test if this triggers an action. + ## Contents This repository contains the source code for the I-analyzer web application, which consists of a Django backend and Angular frontend. From f75693cac1b8c565de440f71792bfc1d67bbbb6a Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:26:21 +0200 Subject: [PATCH 89/94] Revert "revertme: check if README triggers test actions" This reverts commit 22ca665c0e7a092ac0fc0c576ec6aa3fb1c36bf6. 
--- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 56da2c8ba..8fd6080d8 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,6 @@ I-analyzer is a web application for exploring corpora (large collections of text I-analyzer is primarily intended for academic research and higher education. We focus on data that is relevant for the humanities, but we are open to datasets that are relevant for other fields. -Editing this README to test if this triggers an action. - ## Contents This repository contains the source code for the I-analyzer web application, which consists of a Django backend and Angular frontend. From 8e9d78fb373cac1fd48cf669ff4cbb1e8c59968f Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:27:38 +0200 Subject: [PATCH 90/94] revertme: check if editing a change to any other .md file triggers action --- backend/corpora/parliament/description/sweden.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/corpora/parliament/description/sweden.md b/backend/corpora/parliament/description/sweden.md index 661e9b8ea..ab5b635f7 100644 --- a/backend/corpora/parliament/description/sweden.md +++ b/backend/corpora/parliament/description/sweden.md @@ -1 +1,3 @@ The debates of the First and Second Chambers of the bicameral parliament Riksdag until 1971, and the debates of the unicameral parliament Riksdag since 1971, enriched for the period 1920-2020 by Fredrik Norén and his team at HumLab, Umeå University, Sweden. + +Checking here, too. \ No newline at end of file From 064ae30cd354678da3f19137c87eb25db09ca12e Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 29 May 2024 15:28:00 +0200 Subject: [PATCH 91/94] Revert "revertme: check if editing a change to any other .md file triggers action" This reverts commit 8e9d78fb373cac1fd48cf669ff4cbb1e8c59968f. 
--- backend/corpora/parliament/description/sweden.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/corpora/parliament/description/sweden.md b/backend/corpora/parliament/description/sweden.md index ab5b635f7..661e9b8ea 100644 --- a/backend/corpora/parliament/description/sweden.md +++ b/backend/corpora/parliament/description/sweden.md @@ -1,3 +1 @@ The debates of the First and Second Chambers of the bicameral parliament Riksdag until 1971, and the debates of the unicameral parliament Riksdag since 1971, enriched for the period 1920-2020 by Fredrik Norén and his team at HumLab, Umeå University, Sweden. - -Checking here, too. \ No newline at end of file From 0e094836682903ca6d548fda1a64b29979a9e34f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 4 Jun 2024 19:13:50 +0200 Subject: [PATCH 92/94] add page titles --- frontend/src/app/about/about.component.ts | 6 ++++-- .../src/app/corpus-info/corpus-info.component.ts | 11 +++++++++-- .../src/app/document-page/document-page.component.ts | 4 ++++ .../download-history/download-history.component.ts | 6 +++++- .../search-history/search-history.component.ts | 8 ++++++-- frontend/src/app/home/home.component.ts | 5 +++-- frontend/src/app/login/login.component.html | 2 +- frontend/src/app/login/login.component.ts | 3 ++- .../app/login/registration/registration.component.ts | 9 +++++++-- .../login/reset-password/request-reset.component.ts | 12 ++++++++++-- .../reset-password/reset-password.component.html | 2 +- .../login/reset-password/reset-password.component.ts | 11 ++++++----- .../app/login/verify-email/verify-email.component.ts | 6 +++++- frontend/src/app/manual/manual.component.ts | 9 +++++++-- frontend/src/app/menu/menu.component.ts | 6 +----- frontend/src/app/privacy/privacy.component.html | 2 +- frontend/src/app/privacy/privacy.component.ts | 11 +++++++---- frontend/src/app/search/search.component.ts | 4 ++++ frontend/src/app/settings/settings.component.ts | 12 +++++++++--- 
.../app/tag/tag-overview/tag-overview.component.ts | 6 +++++- frontend/src/app/utils/app.ts | 4 ++++ .../src/app/word-models/word-models.component.ts | 5 ++++- 22 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 frontend/src/app/utils/app.ts diff --git a/frontend/src/app/about/about.component.ts b/frontend/src/app/about/about.component.ts index 5c76d2ca5..3699bfcf5 100644 --- a/frontend/src/app/about/about.component.ts +++ b/frontend/src/app/about/about.component.ts @@ -1,5 +1,5 @@ import { Component, OnInit } from '@angular/core'; -import { SafeHtml } from '@angular/platform-browser'; +import { SafeHtml, Title } from '@angular/platform-browser'; import { environment } from '../../environments/environment'; import { DialogService } from '../services'; @@ -13,12 +13,14 @@ export class AboutComponent implements OnInit { public aboutHtml: SafeHtml; public isLoading = false; - constructor(private dialogService: DialogService) { } + constructor(private dialogService: DialogService, private title: Title) { + } ngOnInit() { this.isLoading = true; this.appName = environment.appName; this.fetchData(); + this.title.setTitle(`About - ${this.appName}`); } async fetchData() { diff --git a/frontend/src/app/corpus-info/corpus-info.component.ts b/frontend/src/app/corpus-info/corpus-info.component.ts index 2c89aafb2..dd12cc57b 100644 --- a/frontend/src/app/corpus-info/corpus-info.component.ts +++ b/frontend/src/app/corpus-info/corpus-info.component.ts @@ -1,8 +1,10 @@ import { Component, OnInit } from '@angular/core'; -import { ApiService, CorpusService, WordmodelsService } from '../services'; +import { ApiService, CorpusService } from '../services'; import { Corpus, CorpusDocumentationPage, FieldCoverage } from '../models'; import { marked } from 'marked'; import { Observable } from 'rxjs'; +import { Title } from '@angular/platform-browser'; +import { pageTitle } from '../utils/app'; @Component({ selector: 'ia-corpus-info', @@ -16,7 +18,11 @@ export class 
CorpusInfoComponent implements OnInit { documentation$: Observable; - constructor(private corpusService: CorpusService, private apiService: ApiService, private wordModelsService: WordmodelsService) { } + constructor( + private corpusService: CorpusService, + private apiService: ApiService, + private title: Title, + ) { } ngOnInit(): void { this.corpusService.currentCorpus.subscribe(this.setCorpus.bind(this)); @@ -28,6 +34,7 @@ export class CorpusInfoComponent implements OnInit { this.apiService.fieldCoverage(corpus.name).then( result => this.fieldCoverage = result ); + this.title.setTitle(pageTitle(`About ${corpus.title}`)); } renderMarkdown(content: string): string { diff --git a/frontend/src/app/document-page/document-page.component.ts b/frontend/src/app/document-page/document-page.component.ts index 74bb001eb..4d44a7d61 100644 --- a/frontend/src/app/document-page/document-page.component.ts +++ b/frontend/src/app/document-page/document-page.component.ts @@ -6,6 +6,8 @@ import { Corpus, FoundDocument } from '../models'; import { CorpusService, ElasticSearchService } from '../services'; import { makeContextParams } from '../utils/document-context'; import { documentIcons } from '../shared/icons'; +import { Title } from '@angular/platform-browser'; +import { pageTitle } from '../utils/app'; @Component({ selector: 'ia-document-page', @@ -25,6 +27,7 @@ export class DocumentPageComponent implements OnInit { private corpusService: CorpusService, private elasticSearchService: ElasticSearchService, private activatedRoute: ActivatedRoute, + private title: Title, ) { } get contextDisplayName(): string { @@ -53,6 +56,7 @@ export class DocumentPageComponent implements OnInit { this.corpus = corpus; this.documentId = params['id']; this.getDocument(this.documentId); + this.title.setTitle(pageTitle(`Document in ${corpus.title}`)); }); } diff --git a/frontend/src/app/history/download-history/download-history.component.ts 
b/frontend/src/app/history/download-history/download-history.component.ts index 46f996c3e..eadc44b8b 100644 --- a/frontend/src/app/history/download-history/download-history.component.ts +++ b/frontend/src/app/history/download-history/download-history.component.ts @@ -6,6 +6,8 @@ import { HistoryDirective } from '../history.directive'; import { findByName } from '../../utils/utils'; import { actionIcons } from '../../shared/icons'; import { downloadQueryModel, downloadQueryModels } from '../../utils/download-history'; +import { Title } from '@angular/platform-browser'; +import { pageTitle } from '../../utils/app'; @Component({ selector: 'ia-download-history', @@ -23,12 +25,14 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit private downloadService: DownloadService, private apiService: ApiService, corpusService: CorpusService, - private notificationService: NotificationService + private notificationService: NotificationService, + private title: Title, ) { super(corpusService); } ngOnInit(): void { + this.title.setTitle(pageTitle('Downloads')); this.retrieveCorpora(); this.apiService.downloads() .then(downloadHistory => this.downloads = this.sortByDate(downloadHistory)) diff --git a/frontend/src/app/history/search-history/search-history.component.ts b/frontend/src/app/history/search-history/search-history.component.ts index 440b9441a..62d1a8817 100644 --- a/frontend/src/app/history/search-history/search-history.component.ts +++ b/frontend/src/app/history/search-history/search-history.component.ts @@ -1,5 +1,5 @@ import { Component, OnInit } from '@angular/core'; -import { Params, Router } from '@angular/router'; +import { Params } from '@angular/router'; import * as _ from 'lodash'; import { apiQueryToQueryModel } from '../../utils/es-query'; import { QueryDb } from '../../models/index'; @@ -7,6 +7,8 @@ import { CorpusService, QueryService } from '../../services/index'; import { HistoryDirective } from '../history.directive'; import 
{ findByName } from '../../utils/utils'; import { actionIcons } from '../../shared/icons'; +import { Title } from '@angular/platform-browser'; +import { pageTitle } from '../../utils/app'; @Component({ selector: 'ia-search-history', @@ -22,12 +24,14 @@ export class SearchHistoryComponent extends HistoryDirective implements OnInit { constructor( corpusService: CorpusService, private queryService: QueryService, - private router: Router + private title: Title, + ) { super(corpusService); } async ngOnInit() { + this.title.setTitle(pageTitle('Search history')); this.retrieveCorpora(); this.queryService.retrieveQueries().then((searchHistory) => { const sortedQueries = this.sortByDate(searchHistory); diff --git a/frontend/src/app/home/home.component.ts b/frontend/src/app/home/home.component.ts index 0b02acb85..b52d53e7e 100644 --- a/frontend/src/app/home/home.component.ts +++ b/frontend/src/app/home/home.component.ts @@ -4,6 +4,7 @@ import { BehaviorSubject } from 'rxjs'; import { Corpus } from '../models/corpus'; import { CorpusService } from '../services/index'; import { showLoading } from '../utils/utils'; +import { environment } from '../../environments/environment'; @Component({ selector: 'ia-home', @@ -15,8 +16,8 @@ export class HomeComponent implements OnInit { isLoading = new BehaviorSubject(false); - constructor(private corpusService: CorpusService, private title: Title) { - this.title.setTitle('Home'); + constructor(private corpusService: CorpusService, title: Title) { + title.setTitle(environment.appName); } ngOnInit() { diff --git a/frontend/src/app/login/login.component.html b/frontend/src/app/login/login.component.html index 7dda417c5..7864f4eee 100644 --- a/frontend/src/app/login/login.component.html +++ b/frontend/src/app/login/login.component.html @@ -2,7 +2,7 @@