Refactor harvester/indexer interface and remove solrj binding
wwelling committed Aug 9, 2023
1 parent 154df64 commit fb7144b
Showing 21 changed files with 179 additions and 577 deletions.
1 change: 0 additions & 1 deletion solr/configsets/scholars-discovery/conf/managed-schema.xml
@@ -111,7 +111,6 @@
-->

<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="class" type="string" indexed="true" stored="true" required="true" multiValued="false" />

<!-- docValues are enabled by default for long type so we don't need to index the version field -->
<field name="_version_" type="plong" indexed="false" stored="false"/>
@@ -16,6 +16,8 @@ public class DiscoveryConstants {

public static final String MOD_TIME = "modTime";

public static final String SYNC_IDS = "syncIds";

public static final String QUERY_DELIMETER = ":";

public static final String DEFAULT_QUERY = WILDCARD + QUERY_DELIMETER + WILDCARD;
@@ -1,13 +1,14 @@
package edu.tamu.scholars.middleware.discovery.component;

import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;
import reactor.core.publisher.Flux;

public interface Harvester {

public Flux<AbstractIndexDocument> harvest();
public Flux<Individual> harvest();

public AbstractIndexDocument harvest(String subject);
public Individual harvest(String subject);

public Class<AbstractIndexDocument> type();

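The refactored Harvester emits Individual instead of subclasses of AbstractIndexDocument. The Individual model is among the 21 changed files but is not expanded in this view, so the following is only a minimal sketch of its shape, inferred from how the rest of this diff uses it (setId, setClazz, getContent, getSyncIds); the real class likely carries annotations and members not shown here.

package edu.tamu.scholars.middleware.discovery.model;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch only: inferred from usage in this diff, not the actual source file.
public class Individual {

    private String id;

    private String clazz;

    private List<String> syncIds = new ArrayList<>();

    // generic field-name-to-values map that replaces the typed,
    // solrj-annotated properties of the old document subclasses
    private Map<String, Collection<Object>> content = new HashMap<>();

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getClazz() { return clazz; }

    public void setClazz(String clazz) { this.clazz = clazz; }

    public List<String> getSyncIds() { return syncIds; }

    public void setSyncIds(List<String> syncIds) { this.syncIds = syncIds; }

    public Map<String, Collection<Object>> getContent() { return content; }

}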
@@ -3,14 +3,15 @@
import java.util.Collection;

import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;

public interface Indexer {

public void init();

public void index(Collection<AbstractIndexDocument> documents);
public void index(Collection<Individual> documents);

public void index(AbstractIndexDocument document);
public void index(Individual document);

public void optimize();

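With both interfaces now speaking in terms of Individual, a caller can pipe the harvester's Flux straight into the batch overload of index. The wiring below is a hypothetical sketch; the IndexRunner class name and the batch size of 500 are assumptions and are not part of this commit.

import edu.tamu.scholars.middleware.discovery.component.Harvester;
import edu.tamu.scholars.middleware.discovery.component.Indexer;
import edu.tamu.scholars.middleware.discovery.model.Individual;
import reactor.core.publisher.Flux;

// Hypothetical caller; the actual index service is not shown in this commit.
public class IndexRunner {

    private final Harvester harvester;

    private final Indexer indexer;

    public IndexRunner(Harvester harvester, Indexer indexer) {
        this.harvester = harvester;
        this.indexer = indexer;
    }

    public void run() {
        indexer.init();
        Flux<Individual> individuals = harvester.harvest();
        individuals.buffer(500)           // assumed batch size
            .doOnNext(indexer::index)     // resolves to index(Collection<Individual>)
            .blockLast();
        indexer.optimize();
    }

}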
@@ -1,16 +1,20 @@
package edu.tamu.scholars.middleware.discovery.component.jena;

import static edu.tamu.scholars.middleware.discovery.DiscoveryConstants.CLASS;
import static edu.tamu.scholars.middleware.discovery.DiscoveryConstants.ID;
import static edu.tamu.scholars.middleware.discovery.DiscoveryConstants.NESTED_DELIMITER;
import static edu.tamu.scholars.middleware.discovery.DiscoveryConstants.SYNC_IDS;

import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.ParameterizedType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@@ -32,6 +36,7 @@
import edu.tamu.scholars.middleware.discovery.annotation.FieldType;
import edu.tamu.scholars.middleware.discovery.component.Harvester;
import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;
import edu.tamu.scholars.middleware.service.TemplateService;
import edu.tamu.scholars.middleware.service.Triplestore;
import reactor.core.publisher.Flux;
@@ -66,22 +71,21 @@ public TriplestoreHarvester(Class<AbstractIndexDocument> type) {
this.indexedFields = FieldUtils.getFieldsListWithAnnotation(type, FieldType.class);
}

public Flux<AbstractIndexDocument> harvest() {
public Flux<Individual> harvest() {
CollectionSource source = type.getAnnotation(CollectionSource.class);
String query = templateService.templateSparql(COLLECTION_SPARQL_TEMPLATE, source.predicate());
if (logger.isDebugEnabled()) {
logger.debug(String.format("%s:\n%s", COLLECTION_SPARQL_TEMPLATE, query));
}
logger.debug(String.format("%s:\n%s", COLLECTION_SPARQL_TEMPLATE, query));
QueryExecution queryExecution = triplestore.createQueryExecution(query);
Iterator<Triple> tripleIterator = queryExecution.execConstructTriples();
Iterable<Triple> triples = () -> tripleIterator;

return Flux.fromIterable(triples)
.map(this::subject)
.map(this::harvest)
.doFinally(onFinally -> queryExecution.close());
}

public AbstractIndexDocument harvest(String subject) {
public Individual harvest(String subject) {
try {
return createDocument(subject);
} catch (Exception e) {
@@ -102,22 +106,36 @@ private String subject(Triple triple) {
return triple.getSubject().toString();
}

private AbstractIndexDocument createDocument(String subject) throws InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
AbstractIndexDocument document = construct();
Field field = FieldUtils.getField(type, ID, true);
field.set(document, parse(subject));
lookupProperties(document, subject);
lookupSyncIds(document);
return document;
private Individual createDocument(String subject) throws InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
Individual individual = new Individual();
individual.setId(parse(subject));
individual.setClazz(name());

lookupProperties(individual, subject);
lookupSyncIds(individual);

Map<String, Collection<Object>> content = individual.getContent();
content.put(ID, Arrays.asList(individual.getId()));
content.put(CLASS, Arrays.asList(individual.getClazz()));
content.put(SYNC_IDS, new ArrayList<>(individual.getSyncIds()));

return individual;
}

private void lookupProperties(AbstractIndexDocument document, String subject) {
private void lookupProperties(Individual individual, String subject) {
Map<String, Collection<Object>> content = individual.getContent();
propertySourceTypeOps.parallelStream().forEach(typeOp -> {
try {
FieldSource source = typeOp.getPropertySource();
Model model = queryForModel(source, subject);
List<Object> values = lookupProperty(typeOp, source, model);
populate(document, typeOp.getField(), values);

if (values.isEmpty()) {
logger.debug(String.format("Could not find values for %s", typeOp.getField().getName()));
} else {
content.put(typeOp.getField().getName(), values);
}

} catch (Exception e) {
logger.error(String.format("Unable to populate document %s: %s", name(), parse(subject)));
logger.error(String.format("Error: %s", e.getMessage()));
@@ -130,9 +148,7 @@ private void lookupProperties(AbstractIndexDocument document, String subject) {

private Model queryForModel(FieldSource source, String subject) {
String query = templateService.templateSparql(source.template(), subject);
if (logger.isDebugEnabled()) {
logger.debug(String.format("%s:\n%s", source.template(), query));
}
logger.debug(String.format("%s:\n%s", source.template(), query));
try (QueryExecution qe = triplestore.createQueryExecution(query)) {
Model model = qe.execConstruct();
if (logger.isDebugEnabled()) {
@@ -187,44 +203,17 @@ private List<Object> queryForProperty(TypeOp typeOp, FieldSource source, Model m
return values;
}

private void populate(AbstractIndexDocument document, Field field, List<Object> values) throws IllegalArgumentException, IllegalAccessException {
if (values.isEmpty()) {
if (logger.isDebugEnabled()) {
logger.debug(String.format("Could not find values for %s", field.getName()));
}
} else {
field.setAccessible(true);
if (List.class.isAssignableFrom(field.getType())) {
field.set(document, values);
} else {
field.set(document, values.get(0));
}
}
}

@SuppressWarnings("unchecked")
private void lookupSyncIds(AbstractIndexDocument document) {
private void lookupSyncIds(Individual individual) {
Set<String> syncIds = new HashSet<String>();
syncIds.add(document.getId());
syncIds.add(individual.getId());
indexedFields.stream().filter(this::isNestedField).peek(field -> field.setAccessible(true)).forEach(field -> {
try {
Object value = field.get(document);
if (value != null) {
if (Collection.class.isAssignableFrom(field.getType())) {
((Collection<String>) value).forEach(v -> addSyncId(syncIds, v));
} else {
addSyncId(syncIds, (String) value);
}
}
} catch (IllegalArgumentException | IllegalAccessException e) {
logger.error(String.format("Unable to get value of %s %s", name(), field.getName()));
logger.error(String.format("Error: %s", e.getMessage()));
if (logger.isDebugEnabled()) {
e.printStackTrace();
}
String name = field.getName();
Collection<Object> value = individual.getContent().get(name);
if (value != null) {
value.forEach(v -> addSyncId(syncIds, (String) v));
}
});
document.setSyncIds(new ArrayList<>(syncIds));
individual.setSyncIds(new ArrayList<>(syncIds));
}

private boolean isNestedField(Field field) {
@@ -238,10 +227,6 @@ private void addSyncId(Set<String> syncIds, String value) {
}
}

private AbstractIndexDocument construct() throws InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
return type.getConstructor().newInstance(new Object[0]);
}

private String name() {
return type.getSimpleName();
}
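
For a concrete sense of what createDocument now builds, the snippet below constructs an Individual roughly the way the method above does. The subject id n12345, the class Person, and the name property are purely illustrative; real field names come from the harvested type's @FieldType annotations.

import java.util.Arrays;
import java.util.Collection;
import java.util.Map;

import edu.tamu.scholars.middleware.discovery.model.Individual;

public class IndividualExample {

    // Illustrative only: roughly what createDocument produces for a
    // hypothetical Person subject; "name" stands in for a property
    // contributed by lookupProperties.
    public static Individual example() {
        Individual individual = new Individual();
        individual.setId("n12345");
        individual.setClazz("Person");
        individual.setSyncIds(Arrays.asList("n12345"));

        Map<String, Collection<Object>> content = individual.getContent();
        content.put("id", Arrays.asList(individual.getId()));
        content.put("class", Arrays.asList(individual.getClazz()));
        content.put("syncIds", Arrays.asList("n12345"));
        content.put("name", Arrays.asList("Jane Doe"));

        return individual;
    }

}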
@@ -20,6 +20,7 @@
import edu.tamu.scholars.middleware.discovery.annotation.FieldType;
import edu.tamu.scholars.middleware.discovery.component.Indexer;
import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;

public class SolrIndexer implements Indexer {

@@ -79,7 +80,7 @@ public void init() {
}

@Override
public void index(Collection<AbstractIndexDocument> documents) {
public void index(Collection<Individual> documents) {
try {
solrClient.addBeans(COLLECTION, documents);
solrClient.commit(COLLECTION);
@@ -91,7 +92,7 @@ public void index(Collection<AbstractIndexDocument> documents) {
}

@Override
public void index(AbstractIndexDocument document) {
public void index(Individual document) {
try {
solrClient.addBean(COLLECTION, document);
solrClient.commit(COLLECTION);
@@ -1,35 +1,35 @@
package edu.tamu.scholars.middleware.discovery.model;

import static edu.tamu.scholars.middleware.discovery.DiscoveryConstants.CLASS;

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.beans.Field;

import com.fasterxml.jackson.annotation.JsonProperty;

import edu.tamu.scholars.middleware.discovery.annotation.FieldSource;
import edu.tamu.scholars.middleware.discovery.annotation.FieldType;

public abstract class AbstractIndexDocument {

@Field
@FieldType(required = true, readonly = true)
private String id;

@JsonProperty("class")
@Field("class")
@FieldType(type = "string", value = "class", required = true)
@JsonProperty(CLASS)
@FieldType(type = "string", value = CLASS, required = true)
private String clazz = this.getClass().getSimpleName();

@Field
@FieldType(type = "whole_strings")
@FieldSource(template = "common/type", predicate = "http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType", parse = true)
private List<String> type;

@Field
@FieldType(type = "strings")
private List<String> syncIds = new ArrayList<>();

@FieldType(type = "pdate")
@FieldSource(template = "common/modTime", predicate = "http://vitro.mannlib.cornell.edu/ns/vitro/0.7#modTime")
private String modTime;

public String getId() {
return id;
}
@@ -62,4 +62,12 @@ public void setSyncIds(List<String> syncIds) {
this.syncIds = syncIds;
}

public String getModTime() {
return modTime;
}

public void setModTime(String modTime) {
this.modTime = modTime;
}

}
