diff --git a/marc_to_solr/lib/genre.rb b/marc_to_solr/lib/genre.rb
index e811e380..b367f14d 100644
--- a/marc_to_solr/lib/genre.rb
+++ b/marc_to_solr/lib/genre.rb
@@ -12,8 +12,12 @@ def initialize(record)
   # 655 $a, $v, $x filtered
   def to_a
     @as_array ||= (
-      genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v
-    ).uniq
+      genres_from_subfield_x +
+      genres_from_subject_vocabularies +
+      genres_from_subfield_v +
+      genres_from_primary_source_mapping +
+      genres_from_autobiography
+    ).compact.uniq
   end
 
   private
@@ -60,6 +64,78 @@ def genres_from_subfield_v
     end
   end
 
+  # Adds 'Primary source' when a subject heading (second indicator 0) carries a
+  # primary-source term in $v/$x, or in 655 $a, and the record is not coded as a
+  # literary work.
+  #
+  # @return [Array<String>] ['Primary source'] or an empty array
+  def genres_from_primary_source_mapping
+    return [] if literary_work?
+
+    terms = subject_terms('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|vx:651|*0|vx:655|*0|a:655|*0|vx')
+    terms.any? { |term| genre_term_indicates_primary_source?(term) } ? ['Primary source'] : []
+  end
+
+  # Adds 'Primary source' for autobiographies: a biography whose author (100)
+  # also appears as a name subject (600), unless the record is a literary work.
+  #
+  # @return [Array<String>] ['Primary source'] or an empty array
+  def genres_from_autobiography
+    return [] if literary_work?
+
+    autobiography = biography? && author_matches_subject?
+    autobiography ? ['Primary source'] : []
+  end
+
+  # @param genre [String] text extracted from one subject heading
+  # @return [Boolean] whether the text contains a primary-source term
+  def genre_term_indicates_primary_source?(genre)
+    # Whole-word match: a bare substring test would also flag headings such as
+    # "Information resources", which merely contains "sources".
+    genre.match?(primary_source_pattern)
+  end
+
+  # Case-insensitive pattern matching any primary-source term as a whole word or
+  # phrase; built once per instance.
+  def primary_source_pattern
+    @primary_source_pattern ||= /\b(?:#{Regexp.union(primary_source_genres).source})\b/i
+  end
+
+  # @return [Boolean] whether any subject heading contains the word "Biography"
+  def biography?
+    # Match the word rather than the whole string so that a trailing period
+    # ("Biography.") or neighbouring subfield text does not hide the term.
+    # NOTE(review): Traject appears to join several subfields of one field into
+    # a single string when the spec names more than one code -- confirm.
+    terms = subject_terms('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|avx:651|*0|vx:655|*0|avx')
+    terms.any? { |term| term.match?(/\bBiography\b/) }
+  end
+
+  # @return [Boolean] whether a 100 name equals a 600 name after normalization
+  def author_matches_subject?
+    subjects = normalized_names('600abcdjq')
+    normalized_names('100abcdjq').any? { |author| subjects.include?(author) }
+  end
+
+  # Collects the subfield text selected by a Traject extraction spec.
+  #
+  # @param spec [String] Traject MARC extraction spec
+  # @return [Array<String>]
+  def subject_terms(spec)
+    Traject::MarcExtractor.cached(spec).collect_matching_lines(record) do |field, field_spec, extractor|
+      extractor.collect_subfields(field, field_spec)
+    end
+  end
+
+  # Extracts names with the given spec, de-duplicates them, then downcases,
+  # strips and trims punctuation so that 100 and 600 forms can be compared.
+  #
+  # @param spec [String] Traject MARC extraction spec
+  # @return [Array<String>]
+  def normalized_names(spec)
+    Traject::MarcExtractor.cached(spec).extract(record).uniq.map do |name|
+      Traject::Macros::Marc21.trim_punctuation(name.downcase.strip)
+    end
+  end
+
   def likely_genre_term term
     genre_terms.include?(term) || genre_starting_terms.any? { |potential| term.start_with? potential }
   end
@@ -109,4 +185,42 @@ def genre_starting_terms
       'Translations into '
     ]
   end
+
+  # Lower-case subject terms that mark a work as a primary source.
+  def primary_source_genres
+    [
+      'atlases',
+      'charters',
+      'correspondence',
+      'diaries',
+      'documents',
+      'interview',
+      'interviews',
+      'letters',
+      'manuscripts',
+      'maps',
+      'notebooks, sketchbooks, etc',
+      'oral history',
+      'pamphlets',
+      'personal narratives',
+      'photographs',
+      'pictorial works',
+      'sources',
+      'speeches',
+      'statistics'
+    ]
+  end
+
+  # A book whose 008/33 (literary form) is one of 1, d, e, f, j or p. Such
+  # records are never tagged as primary sources.
+  def literary_work?
+    return false unless book?
+
+    record.fields('008').any? { |field| %w[1 d e f j p].include?(field.value[33]) }
+  end
+
+  # Leader bytes 6-7 are "a" followed by one of a, c, d, i or m.
+  def book?
+    record.leader&.slice(6..7)&.match?(/a[acdim]/)
+  end
 end
diff --git a/spec/fixtures/marc_to_solr/99117267623506421.mrx b/spec/fixtures/marc_to_solr/99117267623506421.mrx
new file mode 100644
index 00000000..6d57ef35
--- /dev/null
+++ b/spec/fixtures/marc_to_solr/99117267623506421.mrx
@@ -0,0 +1,388 @@
+
+
+
+ 04137cam a2200853Ii 4500
+ 20240812122717.0
+ 191113s2019 cc fh b 000 0achi d
+ 99117267623506421
+
+ 9787101138320
+
+
+ 7101138322
+
+
+ (NjP)11726762-princetondb
+
+
+ (OCoLC)on1101985602
+
+
+ (NjP)Voyager11726762
+
+
+ BCBTC
+ eng
+ rda
+ BCBTC
+ OCLCQ
+ CLU
+ OCLCF
+ ZYU
+ CGU
+ PUL
+
+
+ a-cc---
+
+
+ PULL
+
+
+ DS757.23.H427
+ A3 2019
+
+
+ 880-01
+ He, Rulin,
+ 1618-1689
+ author.
+
+
+ 100-01
+ 何汝霖,
+ 1618-1689,
+ author.
+ + + 880-02 + Wan Qing jun ji da chen ri ji wu zhong / + [Qing] He Rulin, Ji Zhichang, Shen Zhaolin, Xu Gengshen, Liao + Shouheng zhuan ; Zhang Jian, Zheng Yuan zheng li. + + + 245-02 + 晚清軍機大臣日記五種 / + [清]何汝霖, 季芝昌, 沈兆霖, 许庚身, 廖寿恒撰 ; 张剑, 郑园整理. + + + Wan Qing jun ji da chen ri ji 5 zhong + + + 880-03 + Beijing di 1 ban. + + + 250-03 + 北京第1版. + + + 880-04 + Beijing Shi : + Zhonghua shu ju, + 2019. + + + 264-04 + 北京市 : + 中华书局, + 2019. + + + 2 volumes (2, 11, 14, 964 pages), 6 unnumbered pages of plates : + facsimiles (some color) ; + 21 cm. + + + text + txt + rdacontent + + + unmediated + n + rdamedia + + + volume + nc + rdacarrier + + + 880-05 + Zhongguo jin dai ren wu ri ji cong shu + + + 490-05 + 中国近代人物日记丛书 + + + Includes bibliographical references. + + + 880-06 + He, Rulin, + 1618-1689 + Diaries. + + + 600-06 + 何汝霖, + 1618-1689 + Diaries. + + + 880-07 + Ji, Zhichang, + 1791-1861 + Diaries. + + + 600-07 + 季芝昌, + 1791-1861 + Diaries. + + + 880-08 + Shen, Zhaolin, + 1801-1862 + Diaries. + + + 600-08 + 沈兆霖, + 1801-1862 + Diaries. + + + 880-09 + Xu, Gengshen + Diaries. + + + 600-09 + 许庚身 + Diaries. + + + 880-10 + Liao, Shouheng + Diaries. + + + 600-10 + 廖寿恒 + Diaries. + + + Statesmen + China + Diaries. + + + China + Officials and employees + Diaries. + + + China + History + 1861-1912 + Sources. + + + Shen, Zhaolin, + 1801-1862. + fast + (OCoLC)fst00173986 + + + Employees. + fast + (OCoLC)fst00909111 + + + Statesmen. + fast + (OCoLC)fst01131990 + + + China. + fast + (OCoLC)fst01206073 + + + 1861-1912 + fast + + + Diaries. + fast + (OCoLC)fst01423794 + + + History. + fast + (OCoLC)fst01411628 + + + Sources. + fast + (OCoLC)fst01423900 + + + 880-11 + Ji, Zhichang, + 1791-1861, + author. + + + 700-11 + 季芝昌, + 1791-1861, + author. + + + 880-12 + Shen, Zhaolin, + 1801-1862, + author. + + + 700-12 + 沈兆霖, + 1801-1862, + author. + + + 880-13 + Xu, Gengshen, + author. + + + 700-13 + 许庚身, + author. + + + 880-14 + Liao, Shouheng, + author. + + + 700-14 + 廖寿恒, + author. 
+ + + 880-15 + Zhang, Jian, + editor. + + + 700-15 + 张剑, + editor. + + + 880-16 + Zheng, Yuan, + editor. + + + 700-16 + 郑园, + editor. + + + 880-17 + Zhongguo jin dai ren wu ri ji cong shu + + + 830-17 + 中国近代人物日记丛书. + + + gml + m + a + m + v + 2 + 20210122 + + + ww + o + m + t + 20191204 + + + (OCoLC)on1101985602 + OCoLC + match + 20221117 + processed + 1101985602 + + + Beijing Zhenben Technology Co. Ltd. + BCBT + bcbtc20190516031 + + + C0 + PUL + + + 11726762 + + + 2024-09-12 06:09:05 US/Eastern + 2021-07-12 18:45:56 US/Eastern + false + + + eastasian + cjk + DS757.23.H427 + A3 2019 + 22592871540006421 + + + Vol.1-v.2 + 22592871540006421 + + + 2021-07-12 22:45:56 + 22592871540006421 + East Asian Library + cjk: East Asian Library + false + + + 22592871540006421 + 23592871530006421 + vol.1 + 1 + cjk + 2020-12-17 19:00:00 US/Eastern + 32101108406198 + 0 + eastasian + + + 22592871540006421 + 23592871520006421 + vol.2 + 1 + cjk + 2021-01-27 19:00:00 US/Eastern + 32101104612039 + 0 + eastasian + + + diff --git a/spec/marc_to_solr/lib/config_spec.rb b/spec/marc_to_solr/lib/config_spec.rb index f2860147..07eeb127 100644 --- a/spec/marc_to_solr/lib/config_spec.rb +++ b/spec/marc_to_solr/lib/config_spec.rb @@ -89,6 +89,7 @@ def fixture_record(fixture_name, indexer: @indexer) @iso639_3_with_macrolanguage = @indexer.map_record(fixture_record('9930372403506421')) @indigenous_studies_mexico = @indexer.map_record(fixture_record('99125398364906421')) @dissertation_with_embargo = @indexer.map_record(fixture_record('99127127233306421')) + @diary = @indexer.map_record(fixture_record('99117267623506421')) end describe "alma loading" do @@ -1331,6 +1332,14 @@ def fixture_record(fixture_name, indexer: @indexer) end end + describe 'genre_facet' do + context 'when the document is a diary' do + it 'indexes the facet "Primary source"' do + expect(@diary['genre_facet']).to include 'Primary source' + end + end + end + describe 'private recap items' do it "skips indexing record if only item is 
private" do
       expect(@scsb_private).to be nil
diff --git a/spec/marc_to_solr/lib/genre_spec.rb b/spec/marc_to_solr/lib/genre_spec.rb
index 16ca6f74..11f7377d 100644
--- a/spec/marc_to_solr/lib/genre_spec.rb
+++ b/spec/marc_to_solr/lib/genre_spec.rb
@@ -43,5 +43,128 @@
       expect(genres).not_to include("Join")
       expect(genres).not_to include("Dramatic renditon")
     end
+
+    it 'defaults to not including Primary source' do
+      expect(genres).not_to include("Primary source")
+    end
+
+    context 'when the 650 subfield x has Correspondence' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Authors" }, { "x" => "Correspondence" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 600 subfield x has Notebooks, sketchbooks, etc.' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when there is an extra space after the period "etc. "' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc. " }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield v has Pamphlets' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Franco-Prussian War, 1870-1871" }, { "v" => "Pamphlets." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield v has Pamphlets, and the 008 says it is a novel' do
+      # The leader below has "am" at bytes 6-7, which the literary-work check requires before it reads the 008.
+      let(:genres) do
+        g008 = { "008" => 'f'.rjust(34) } # rjust(34) puts "f" (novel) at zero-based index 33, i.e. 008/33
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Franco-Prussian War, 1870-1871" }, { "v" => "Pamphlets." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g008, g650], 'leader' => '04137cam a2200853Ii 4500')
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary Source in the list of genres' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield v has Pamphlets, and the 008 says it is non-fiction' do
+      # Same book leader as the novel case; only the 008/33 value differs.
+      let(:genres) do
+        g008 = { "008" => '0'.rjust(34) } # rjust(34) puts "0" (nonfiction) at zero-based index 33, i.e. 008/33
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Franco-Prussian War, 1870-1871" }, { "v" => "Pamphlets." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g008, g650], 'leader' => '04137cam a2200853Ii 4500')
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 651 subfield x has Pictorial works' do
+      let(:genres) do
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Iran" }, { "x" => "Description and travel" }, { "y" => "20th century" }, { "x" => "Pictorial works." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil," }, { "e" => "author" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil." }] } }
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to include 'Primary source'
+      end
+    end
+    context 'when the 651 subfield v is Biography' do
+      let(:genres) do
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 651 subfield v is Biography, and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }, { "0" => "http://id.loc.gov/authorities/names/n83057391" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }] } }
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to contain_exactly 'Primary source', 'Biography'
+      end
+    end
   end
 end