Skip to content

Commit

Permalink
Add first prov implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
NicoLaval committed Sep 18, 2024
1 parent 02a068a commit 37b49db
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 62 deletions.
2 changes: 2 additions & 0 deletions vtl-prov/docs/model-v1.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ his document present how we could extract some provenance from a VTL program in

Based on `PROV-O` and `SDTH` ontologies.

_TODO: add kind of `cdi:RepresentedVariable`_

```mermaid
classDiagram
class Agent {
Expand Down
97 changes: 73 additions & 24 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/ProvenanceListener.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,29 @@
import fr.insee.vtl.parser.VtlBaseListener;
import fr.insee.vtl.parser.VtlLexer;
import fr.insee.vtl.parser.VtlParser;
import fr.insee.vtl.prov.prov.DataframeInstance;
import fr.insee.vtl.prov.prov.Program;
import fr.insee.vtl.prov.prov.ProgramStep;
import fr.insee.vtl.prov.prov.VariableInstance;
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.tree.ParseTreeWalker;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* ANTLR listener that creates provenance objects.
*/
public class ProvenanceListener extends VtlBaseListener {

private Program program = new Program();
private Map<String, ProgramStep> programSteps = new HashMap<>();
private final Program program = new Program();

private String currentProgramStep;

private boolean isInDatasetClause;

private String currentComponentID;

public ProvenanceListener(String id, String programName) {
program.setId(id);
program.setLabel(programName);
Expand All @@ -45,11 +47,10 @@ public void enterTemporaryAssignment(VtlParser.TemporaryAssignmentContext ctx) {
String id = getText(ctx.varID());
String sourceCode = getText(ctx);
currentProgramStep = id;
if (!programSteps.containsKey(id)) {
ProgramStep programStep = new ProgramStep(id, sourceCode);
programSteps.put(id, programStep);
}
program.getProgramStepIds().add(id);
ProgramStep programStep = new ProgramStep(id, id, sourceCode);
DataframeInstance df = new DataframeInstance(id, id);
programStep.setProducedDataframe(df);
program.getProgramSteps().add(programStep);
}

@Override
Expand All @@ -62,11 +63,10 @@ public void enterPersistAssignment(VtlParser.PersistAssignmentContext ctx) {
String id = getText(ctx.varID());
String sourceCode = getText(ctx);
currentProgramStep = id;
if (!programSteps.containsKey(id)) {
ProgramStep programStep = new ProgramStep(id, sourceCode);
programSteps.put(id, programStep);
}
program.getProgramStepIds().add(id);
ProgramStep programStep = new ProgramStep(id, id, sourceCode);
DataframeInstance df = new DataframeInstance(id, id);
programStep.setProducedDataframe(df);
program.getProgramSteps().add(programStep);
}

@Override
Expand All @@ -76,27 +76,76 @@ public void exitPersistAssignment(VtlParser.PersistAssignmentContext ctx) {

/**
 * Records a reference to a VTL identifier on the program step currently being built.
 *
 * <p>The identifier is ignored when it equals the id of the current program step.
 * Otherwise, outside a dataset clause it is added to the step's consumed dataframes;
 * inside a dataset clause it is added to the step's used variables, but only while a
 * component id is being tracked ({@code currentComponentID} is non-null).
 *
 * <p>If no program step is registered under {@code currentProgramStep}, the identifier
 * is skipped instead of failing with a {@link NullPointerException}.
 *
 * @param ctx the parser context of the visited identifier
 */
@Override
public void enterVarID(VtlParser.VarIDContext ctx) {
    String id = ctx.IDENTIFIER().getText();
    if (id.equals(currentProgramStep)) {
        return;
    }
    ProgramStep programStep = program.getProgramStepById(currentProgramStep);
    if (programStep == null) {
        // getProgramStepById returns null when nothing matches: there is no step to attach to.
        return;
    }
    if (!isInDatasetClause) {
        programStep.getConsumedDataframe().add(new DataframeInstance(id, id));
    } else if (currentComponentID != null) {
        programStep.getUsedVariables().add(new VariableInstance(id, id));
    }
}

/**
 * Flags that the walker is now inside a dataset clause.
 *
 * @param ctx the dataset clause context (not inspected)
 */
@Override
public void enterDatasetClause(VtlParser.DatasetClauseContext ctx) {
    this.isInDatasetClause = true;
}

/**
 * Clears the dataset-clause flag once the walker leaves the clause.
 *
 * @param ctx the dataset clause context (not inspected)
 */
@Override
public void exitDatasetClause(VtlParser.DatasetClauseContext ctx) {
    this.isInDatasetClause = false;
}

/**
 * Registers the visited component id as a variable assigned by the current program step.
 *
 * <p>The component text is used as both id and label of the created
 * {@link VariableInstance}. If no program step is registered under
 * {@code currentProgramStep}, the component is skipped instead of failing with a
 * {@link NullPointerException}.
 *
 * @param ctx the parser context of the visited component id
 */
@Override
public void enterComponentID(VtlParser.ComponentIDContext ctx) {
    ProgramStep programStep = program.getProgramStepById(currentProgramStep);
    if (programStep == null) {
        // getProgramStepById returns null when nothing matches: there is no step to attach to.
        return;
    }
    String id = ctx.getText();
    programStep.getAssignedVariables().add(new VariableInstance(id, id));
}

/**
 * Starts tracking the component id targeted by a calc clause item.
 *
 * @param ctx the calc clause item context whose component id text is stored
 */
@Override
public void enterCalcClauseItem(VtlParser.CalcClauseItemContext ctx) {
    String componentText = getText(ctx.componentID());
    this.currentComponentID = componentText;
}

/**
 * Stops tracking a component id when the calc clause item ends.
 *
 * @param ctx the calc clause item context (not inspected)
 */
@Override
public void exitCalcClauseItem(VtlParser.CalcClauseItemContext ctx) {
    this.currentComponentID = null;
}

/**
 * Starts tracking the component id targeted by an aggregate function clause.
 *
 * @param ctx the aggregate function clause context whose component id text is stored
 */
@Override
public void enterAggrFunctionClause(VtlParser.AggrFunctionClauseContext ctx) {
    String componentText = getText(ctx.componentID());
    this.currentComponentID = componentText;
}

/**
 * Stops tracking a component id when the aggregate function clause ends.
 *
 * @param ctx the aggregate function clause context (not inspected)
 */
@Override
public void exitAggrFunctionClause(VtlParser.AggrFunctionClauseContext ctx) {
    this.currentComponentID = null;
}

/**
* Returns the provenance objects
* Returns the program object
*/
public List<Object> getObjects() {
List<Object> obj = new ArrayList<>();
obj.add(program);
obj.addAll(programSteps.values());
return obj;
public Program getProgram() {
return program;
}

public static List<Object> parseAndListen(String expr, String id, String programName) {
public static Program run(String expr, String id, String programName) {
CodePointCharStream stream = CharStreams.fromString(expr);
VtlLexer lexer = new VtlLexer(stream);
VtlParser parser = new VtlParser(new CommonTokenStream(lexer));

ProvenanceListener provenanceListener = new ProvenanceListener(id, programName);
ParseTreeWalker.DEFAULT.walk(provenanceListener, parser.start());
return provenanceListener.getObjects();
return provenanceListener.getProgram();
}

}
99 changes: 77 additions & 22 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/RDFUtils.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package fr.insee.vtl.prov;

import fr.insee.vtl.prov.prov.DataframeInstance;
import fr.insee.vtl.prov.prov.Program;
import fr.insee.vtl.prov.prov.ProgramStep;
import fr.insee.vtl.prov.prov.VariableInstance;
import fr.insee.vtl.prov.utils.PROV;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
Expand All @@ -12,29 +14,26 @@
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.RDFS;

import java.util.List;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Set;

public class RDFUtils {

private static final String TREVAS_BASE_URI = "http://trevas/";
private static final String SDTH_BASE_URI = "http://rdf-vocabulary.ddialliance.org/sdth#";

public static Model buildModel(List<Object> objects) {
public static Model buildModel(Program program) {
Model model = ModelFactory.createDefaultModel();
model.setNsPrefix("prov", PROV.getURI());
objects.forEach(o -> {
if (o instanceof Program) {
handleProgram(model, (Program) o);
}
if (o instanceof ProgramStep) {
handleProgramStep(model, (ProgramStep) o);
}
});
handleProgram(model, program);
return model;
}

public static void handleProgram(Model model, Program program) {
// Create Program URI, type, label, sourceCode
Resource SDTH_PROGRAM = model.createResource(SDTH_BASE_URI + "Program");
String id = program.getId();
String label = program.getLabel();
Expand All @@ -44,29 +43,75 @@ public static void handleProgram(Model model, Program program) {
String sourceCode = program.getSourceCode();
Property SDTH_HAS_SOURCE_CODE = model.createProperty(SDTH_BASE_URI + "hasSourceCode");
programURI.addProperty(SDTH_HAS_SOURCE_CODE, sourceCode);
Set<String> stepIds = program.getProgramStepIds();
stepIds.forEach(stepId -> {
Property SDTH_HAS_PROGRAM_STEP = model.createProperty(SDTH_BASE_URI + "hasProgramStep");
Resource SDTH_PROGRAM_STEP = model.createResource(SDTH_BASE_URI + "ProgramStep");
// Link and define ProgramSteps
Set<ProgramStep> programSteps = program.getProgramSteps();
Property SDTH_HAS_PROGRAM_STEP = model.createProperty(SDTH_BASE_URI + "hasProgramStep");
programSteps.forEach(step -> {
String stepId = step.getId();
Resource programStepURI = model.createResource(TREVAS_BASE_URI + "program-step/" + stepId);
programURI.addProperty(SDTH_HAS_PROGRAM_STEP, programStepURI);
programStepURI.addProperty(RDF.type, SDTH_PROGRAM_STEP);
programStepURI.addProperty(RDFS.label, "Create " + stepId + " dataset");
handleProgramStep(model, step);
});
}

public static void handleProgramStep(Model model, ProgramStep programStep) {
String label = programStep.getLabel();
Resource programStepURI = model.createResource(TREVAS_BASE_URI + "program-step/" + label);
// Create ProgramStep URI, type, label, sourceCode
String id = programStep.getId();
Resource programStepURI = model.createResource(TREVAS_BASE_URI + "program-step/" + id);
Resource SDTH_PROGRAM_STEP = model.createResource(SDTH_BASE_URI + "ProgramStep");
programStepURI.addProperty(RDF.type, SDTH_PROGRAM_STEP);
programStepURI.addProperty(RDFS.label, "Create " + id + " dataset");
String sourceCode = programStep.getSourceCode();
Property SDTH_HAS_SOURCE_CODE = model.createProperty(SDTH_BASE_URI + "hasSourceCode");
programStepURI.addProperty(SDTH_HAS_SOURCE_CODE, sourceCode);
Resource SDTH_DATAFRAME = model.createResource(SDTH_BASE_URI + "DataframeInstance");
Resource dfProducesURI = model.createResource(TREVAS_BASE_URI + "dataset/" + label);
dfProducesURI.addProperty(RDF.type, SDTH_DATAFRAME);
dfProducesURI.addProperty(RDFS.label, label);
// Link and define producedDF
DataframeInstance dfProduced = programStep.getProducedDataframe();
String dfProducedId = dfProduced.getId();
Resource dfProducesURI = model.createResource(TREVAS_BASE_URI + "dataset/" + dfProducedId);
Property SDTH_PRODUCES_DATAFRAME = model.createProperty(SDTH_BASE_URI + "producesDataframe");
programStepURI.addProperty(SDTH_PRODUCES_DATAFRAME, dfProducesURI);
handleDataframeInstance(model, dfProduced);
// Link and define consumedDF
Property SDTH_CONSUMES_DATAFRAME = model.createProperty(SDTH_BASE_URI + "consumesDataframe");
programStep.getConsumedDataframe().forEach(df -> {
Resource dfConsumedURI = model.createResource(TREVAS_BASE_URI + "dataset/" + df.getId());
programStepURI.addProperty(SDTH_CONSUMES_DATAFRAME, dfConsumedURI);
handleDataframeInstance(model, df);
});
// Link and define usedVariables
Property SDTH_USED_VARIABLE = model.createProperty(SDTH_BASE_URI + "usesVariable");
programStep.getUsedVariables().forEach(v -> {
Resource varUsedURI = model.createResource(TREVAS_BASE_URI + "variable/" + v.getId());
programStepURI.addProperty(SDTH_USED_VARIABLE, varUsedURI);
handleVariableInstance(model, v);
});
// Link and define assignedVariables
Property SDTH_ASSIGNED_VARIABLE = model.createProperty(SDTH_BASE_URI + "assignsVariable");
programStep.getAssignedVariables().forEach(v -> {
Resource varAssignedURI = model.createResource(TREVAS_BASE_URI + "variable/" + v.getId());
programStepURI.addProperty(SDTH_ASSIGNED_VARIABLE, varAssignedURI);
handleVariableInstance(model, v);
});
}

/**
 * Describes a dataframe instance in the model: a resource under
 * {@code TREVAS_BASE_URI + "dataset/"} typed as the SDTH {@code DataframeInstance}
 * resource and labelled with the instance label.
 *
 * @param model      the model receiving the statements
 * @param dfInstance the dataframe instance to describe
 */
public static void handleDataframeInstance(Model model, DataframeInstance dfInstance) {
    Resource dataframeResource = model.createResource(TREVAS_BASE_URI + "dataset/" + dfInstance.getId());
    Resource dataframeType = model.createResource(SDTH_BASE_URI + "DataframeInstance");
    dataframeResource
            .addProperty(RDF.type, dataframeType)
            .addProperty(RDFS.label, dfInstance.getLabel());
}

/**
 * Describes a variable instance in the model: a resource under
 * {@code TREVAS_BASE_URI + "variable/"} typed as the SDTH {@code VariableInstance}
 * resource and labelled with the instance label.
 *
 * @param model       the model receiving the statements
 * @param varInstance the variable instance to describe
 */
public static void handleVariableInstance(Model model, VariableInstance varInstance) {
    Resource variableResource = model.createResource(TREVAS_BASE_URI + "variable/" + varInstance.getId());
    Resource variableType = model.createResource(SDTH_BASE_URI + "VariableInstance");
    variableResource
            .addProperty(RDF.type, variableType)
            .addProperty(RDFS.label, varInstance.getLabel());
}

public static Model initModel(String baseFilePath) {
Expand All @@ -87,4 +132,14 @@ public static void loadModelWithCredentials(Model model,
connection.close();
}
}

/**
 * Writes the model to a file using the {@code "JSON-LD"} serialization language.
 *
 * <p>The output stream is opened with try-with-resources so the file handle is
 * released even when writing fails.
 *
 * @param model the model to write
 * @param path  the destination file path
 * @throws IOException if the file cannot be opened or closed
 */
public static void writeJsonLdToFile(Model model, String path) throws IOException {
    // Fully qualified so the file's import block does not need to change.
    try (java.io.OutputStream out = Files.newOutputStream(Paths.get(path))) {
        model.write(out, "JSON-LD");
    }
}

/**
 * Serializes the model into a string.
 *
 * @param model  the model to serialize
 * @param format the serialization language passed to {@code Model.write}
 * @return the text produced by the writer
 */
public static String serialize(Model model, String format) {
    StringWriter buffer = new StringWriter();
    model.write(buffer, format);
    String serialized = buffer.toString();
    return serialized;
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,27 @@
package fr.insee.vtl.prov.prov;

public class DataframeInstance {
String id;
String label;

public DataframeInstance(String id, String label) {
this.id = id;
this.label = label;
}

public String getId() {
return id;
}

public void setId(String id) {
this.id = id;
}

public String getLabel() {
return label;
}

public void setLabel(String label) {
this.label = label;
}
}
17 changes: 12 additions & 5 deletions vtl-prov/src/main/java/fr/insee/vtl/prov/prov/Program.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public class Program {

String id;
String label;
Set<String> programStepIds = new HashSet<>();
Set<ProgramStep> programSteps = new HashSet<>();

String sourceCode;

Expand Down Expand Up @@ -35,12 +35,12 @@ public void setLabel(String label) {
this.label = label;
}

public Set<String> getProgramStepIds() {
return programStepIds;
public Set<ProgramStep> getProgramSteps() {
return programSteps;
}

public void setProgramSteps(Set<String> programStepIds) {
this.programStepIds = programStepIds;
public void setProgramSteps(Set<ProgramStep> programSteps) {
this.programSteps = programSteps;
}

public String getSourceCode() {
Expand All @@ -50,4 +50,11 @@ public String getSourceCode() {
/**
 * Replaces the source code stored on this program.
 *
 * @param sourceCode the new source code
 */
public void setSourceCode(String sourceCode) {
    this.sourceCode = sourceCode;
}

/**
 * Looks up a program step by its id.
 *
 * @param id the id to search for
 * @return the first step encountered whose id equals {@code id}, or {@code null} when none matches
 */
public ProgramStep getProgramStepById(String id) {
    for (ProgramStep step : programSteps) {
        if (step.getId().equals(id)) {
            return step;
        }
    }
    return null;
}
}
Loading

0 comments on commit 37b49db

Please sign in to comment.