Skip to content

Commit

Permalink
refined existing extractors, re-added BioPortal extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
k00ni committed Apr 16, 2024
1 parent a67b02c commit fc3ab7b
Show file tree
Hide file tree
Showing 10 changed files with 352 additions and 80 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ You can find the index file here: [**index.csv**](./index.csv) (**3000+** ontolo

### Requirements and rules for an ontology-entry in the index

An RDF ontology/vocabulary is part of the index if it meets the following requirements:
An RDF ontology / vocabulary is part of the index if it meets the following criteria:
* non-empty, valid title
* non-empty, valid URI
* non-empty, valid IRI
* at least one valid URL to an RDF file
* at least one RDFS/OWL class is defined or at least one instance of owl:Ontology is found

If an entry is part of multiple sources (e.g. LOV and DBpedia Archivo), the one which appears first is taken.

Expand Down Expand Up @@ -150,11 +151,7 @@ But if you wanna help out with one of the topics, feel free to open an issue or
* [x] license
* [x] authors + contributors
* [x] project page / homepage
* [ ] data source url
* [ ] check prior versions of an ontology to avoid adding the same ontology just with different versions
* [ ] http vs https
* [ ] / vs # at the end
* [ ] mark entries if they contain SKOS entries
* [x] data source url

### Version 0.2

Expand All @@ -164,14 +161,17 @@ But if you wanna help out with one of the topics, feel free to open an issue or
* [ ] https://obofoundry.org/
* [ ] http://www.oegov.us/
* [ ] http://ontologydesignpatterns.org/wiki/Main\_Page
* [ ] https://obofoundry.org/
* [ ] https://github.com/linkeddata/ontology-archiver
* [ ] crawl Github repositories tagged with "ontology" etc.
* [ ] harmonize datetime information for latest access (all UTC?)
* [ ] add basic schema/ontology describing the fields in index.csv
* [ ] add a way to manually provide entries via Github
* [ ] Ping service: on update call a list of URLs to let them know that there was a change
* [ ] generate statistics for each service that is read to build index.csv (number of entries etc.)
* [ ] check prior versions of an ontology to avoid adding the same ontology just with different versions
* [ ] http vs https
* [ ] / vs # at the end
* [ ] mark entries if they contain SKOS entries

## License

Expand Down
2 changes: 1 addition & 1 deletion scripts/bin/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
'http://xmlns.com/foaf/0.1/name',
];

$bioPortalApiKeyFile = ROOT_DIR_PATH.DIRECTORY_SEPARATOR.'scripts'.DIRECTORY_SEPARATOR.'.api_key_bioportal.php';
define('BIOPORTAL_API_KEY_FILE', ROOT_DIR_PATH.DIRECTORY_SEPARATOR.'scripts'.DIRECTORY_SEPARATOR.'.api_key_bioportal.php');

// include vendor libraries
require_once SCRIPTS_DIR_PATH.'vendor'.DIRECTORY_SEPARATOR.'autoload.php';
2 changes: 2 additions & 0 deletions scripts/bin/renew_index.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
(new LinkedOpenVocabularies($cache, $dataFactory, $temporaryIndex))->run();
(new DBpediaArchivo($cache, $dataFactory, $temporaryIndex))->run();
(new OntologyLookupService($cache, $dataFactory, $temporaryIndex))->run();
(new BioPortal($cache, $dataFactory, $temporaryIndex))->run();
return;

// finalize temporary index and write index.csv
(new MergeInManuallyMaintainedMetadata($cache, $dataFactory, $temporaryIndex))->run();
Expand Down
32 changes: 25 additions & 7 deletions scripts/src/Cache.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Cache
*/
private array $caches = [];

private string $filesFolder = __DIR__.'/../var/downloaded_rdf_files/';

private function getCacheInstance(string $namespace): AbstractAdapter
{
if (false === isset($this->caches[$namespace])) {
Expand All @@ -25,19 +27,35 @@ private function getCacheInstance(string $namespace): AbstractAdapter
return $this->caches[$namespace];
}

/**
 * Turns a URL into a string that can be used as a local file name or cache key.
 *
 * Every character that is not an ASCII letter (case-insensitive), a digit, "-" or "_"
 * is replaced by "_". Example: "http://example.org/a-b.ttl" becomes
 * "http___example_org_a-b_ttl".
 *
 * @throws \RuntimeException if the regular expression engine reports an error
 */
private function createSimplifiedFilename(string $fileUrl): string
{
    // preg_replace() returns null on failure; without this check the null would hit
    // the string return type and surface as an unspecific TypeError
    $simplified = preg_replace('/[^a-z0-9\-_]/ism', '_', $fileUrl);
    if (null === $simplified) {
        throw new \RuntimeException(
            'Could not create simplified filename for '.$fileUrl.': '.preg_last_error_msg()
        );
    }

    return $simplified;
}

/**
 * Returns the local path under which the file behind the given URL is stored.
 *
 * The path is only handed out after getLocalFileResourceForFileUrl() returned an open
 * file resource for the URL. That resource merely serves as proof that the file is
 * available locally and is closed again right away, so no handle stays open per call.
 *
 * @return non-empty-string
 *
 * @throws \Exception if no file resource could be obtained for the given URL
 */
public function getCachedFilePathForFileUrl(string $fileUrl): string
{
    $fileRes = $this->getLocalFileResourceForFileUrl($fileUrl);

    if (false === is_resource($fileRes)) {
        throw new Exception('Got no file resource for '.$fileUrl);
    }

    // the handle itself is not needed here, release it instead of leaking it
    fclose($fileRes);

    // path = files folder + simplified filename derived from the URL
    return $this->filesFolder.$this->createSimplifiedFilename($fileUrl);
}

/**
* @return resource|false Return value of fopen(..., 'r')
*
* @throws \Exception in case of a cURL error
*/
public function getLocalFileResourceForFileUrl(string $fileUrl)
{
$filesFolder = __DIR__.'/../var/downloaded_rdf_files/';

// generate simplified filename for local storage
$filename = preg_replace('/[^a-z0-9\-_]/ism', '_', $fileUrl);

$filepath = $filesFolder.$filename;
$filename = $this->createSimplifiedFilename($fileUrl);
$filepath = $this->filesFolder.$filename;

echo PHP_EOL.$fileUrl.' >> '.$filename;

Expand Down Expand Up @@ -80,7 +98,7 @@ public function getLocalFileResourceForFileUrl(string $fileUrl)
public function sendCachedRequest(string $url, string $namespace): string
{
$cache = $this->getCacheInstance($namespace);
$key = (string) preg_replace('/[\W]/', '_', $url);
$key = $this->createSimplifiedFilename($url);

// ask cache for entry
// if there isn't one, run HTTP request and return response content
Expand Down
121 changes: 71 additions & 50 deletions scripts/src/Extractor/AbstractExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,13 @@ abstract public function run(): void;
public function addFurtherMetadata(IndexEntry $indexEntry, Graph $graph): void
{
// short description / summary
$properties = ['skos:definition', 'dc11:description', 'dc:description', 'rdfs:comment'];
$properties = [
'skos:definition',
'dc11:description',
'dc:description',
'rdfs:comment',
'schema:description',
];
$valuesString = $this->getLiteralValuesAsString($graph, $properties, (string) $indexEntry->getOntologyIri());
$valuesString = $this->cleanString($valuesString);
if (false === isEmpty($valuesString)) {
Expand All @@ -64,7 +70,7 @@ public function addFurtherMetadata(IndexEntry $indexEntry, Graph $graph): void

// license
$valuesString = null;
foreach (['dc:license', 'dc11:rights'] as $prop) {
foreach (['dc:license', 'dc11:rights', 'schema:license'] as $prop) {
$valuesString = $this->getLiteralValuesAsString($graph, [$prop], (string) $indexEntry->getOntologyIri(), ' ', true);
$valuesString = $this->getAlignedLicenseInformation($valuesString);

Expand Down Expand Up @@ -93,7 +99,7 @@ public function addFurtherMetadata(IndexEntry $indexEntry, Graph $graph): void
}

// project page / homepage
$properties = ['foaf:homepage', 'schema:WebSite', 'schema:url'];
$properties = ['foaf:homepage', 'schema:WebSite', 'schema:url', 'rdfs:seeAlso'];
$valuesString = $this->getLiteralValuesAsString($graph, $properties, (string) $indexEntry->getOntologyIri());
$valuesString = $this->cleanString($valuesString);
if (false === isEmpty($valuesString)) {
Expand All @@ -103,15 +109,15 @@ public function addFurtherMetadata(IndexEntry $indexEntry, Graph $graph): void
/*
* latest access (latest file)
*/
$properties = ['dc:modified', 'dc11:modified'];
$properties = ['dc:modified', 'dc11:modified', 'schema:dateModified'];
foreach ($properties as $prop) {
$values = $graph->resource($indexEntry->getOntologyIri())->allLiterals($prop);

// create a list of datetime strings
$values = array_map(function ($value) {
if ($value instanceof DateTime || $value instanceof Date) {
return $value->format('Y-m-d');
} else {
} elseif(1 === preg_match('/[0-9]{4}\-[0-9]{2}\-[0-9]{4}/', $value->getValue())) {
return $value->getValue();
}
}, $values);
Expand Down Expand Up @@ -194,34 +200,6 @@ protected function getLiteralValuesAsString(
return $valuesString;
}

/**
 * Guesses the RDF serialization of the file behind the given URL.
 *
 * Reads up to 102400 bytes (1024 * 100) from the locally available copy of the file and
 * passes them to Format::guessFormat(). If that yields nothing but the data contains the
 * marker "<rdf:", the format "rdfxml" is assumed.
 *
 * @param non-empty-string $fileUrl
 *
 * @return string|null name of the guessed format or null if no format could be determined
 *
 * @throws \Exception if no file resource could be opened for the given URL
 */
public function guessFormatOnFile(string $fileUrl): string|null
{
    $handle = $this->cache->getLocalFileResourceForFileUrl($fileUrl);
    if (false === is_resource($handle)) {
        throw new Exception('Could not open related file for '.$fileUrl);
    }

    // only the head of the file is needed for guessing
    $bytesToRead = 1024 * 100;
    $head = (string) fread($handle, $bytesToRead);
    fclose($handle);

    $format = Format::guessFormat($head)?->getName();

    // per the original note, guessFormat() only looks at the first 1024 bytes,
    // therefore search the larger chunk for an RDF/XML marker as a fallback
    if (null == $format && str_contains($head, '<rdf:')) {
        $format = 'rdfxml';
    }

    return $format;
}

/**
* This part could be done more easily with AsEasyRdf::AsEasyRdf, but some ontologies
* contain triples, which have object values that don't correspond with their data types.
Expand Down Expand Up @@ -328,6 +306,47 @@ protected function getAlignedLicenseInformation(string $value): string
}
}

/**
 * Guesses the RDF serialization of the file behind the given URL.
 *
 * Reads up to 102400 bytes (1024 * 100) from the locally available copy of the file and
 * passes them to Format::guessFormat(). If that yields nothing but the data contains the
 * marker "<rdf:", the format "rdfxml" is assumed.
 *
 * Failures whose message contains "HTTP/1.1 403 Forbidden" or
 * "HTTP/1.1 504 Gateway Time-out" are tolerated: the message is printed and null is
 * returned. Any other failure is passed on to the caller unchanged.
 *
 * @param non-empty-string $fileUrl
 *
 * @return string|null name of the guessed format or null if no format could be determined
 *
 * @throws \Throwable if obtaining the file fails for a reason other than the tolerated
 *                    HTTP errors, or if no file resource could be opened
 */
public function guessFormatOnFile(string $fileUrl): string|null
{
    try {
        $handle = $this->cache->getLocalFileResourceForFileUrl($fileUrl);
    } catch (Throwable $error) {
        $message = $error->getMessage();
        $isToleratedHttpError = str_contains($message, 'HTTP/1.1 403 Forbidden')
            || str_contains($message, 'HTTP/1.1 504 Gateway Time-out');

        if (false === $isToleratedHttpError) {
            throw $error;
        }

        // known, non-fatal HTTP problems: report and give up on this file
        echo PHP_EOL.$message;

        return null;
    }

    if (false === is_resource($handle)) {
        throw new Exception('Could not open related file for '.$fileUrl);
    }

    // only the head of the file is needed for guessing
    $bytesToRead = 1024 * 100;
    $head = (string) fread($handle, $bytesToRead);
    fclose($handle);

    $format = Format::guessFormat($head)?->getName();

    // per the original note, guessFormat() only looks at the first 1024 bytes,
    // therefore search the larger chunk for an RDF/XML marker as a fallback
    if (null == $format && str_contains($head, '<rdf:')) {
        $format = 'rdfxml';
    }

    return $format;
}

/**
* Loads the content of a given RDF file into an EasyRdf Graph instance.
*
Expand All @@ -338,20 +357,17 @@ protected function getAlignedLicenseInformation(string $value): string
*
* @throws \Throwable
*/
protected function loadQuadsIntoEasyRdfGraph(
$fileHandle,
string $rdfFileUrl,
string|null $format = null
): Graph {
$maxAmountOfTriples = 10000;
protected function loadQuadsIntoEasyRdfGraph($fileHandle, string $localFilePath): Graph
{
$maxAmountOfTriples = 50000;

try {
/*
* use quickRdfIo's Util::parse
*/
$i = 0;
$list = [];
foreach (Util::parse($fileHandle, $this->dataFactory, $format) as $quad) {
foreach (Util::parse($fileHandle, $this->dataFactory) as $quad) {
$list[] = $quad;
if ($i++ > $maxAmountOfTriples) {
break;
Expand All @@ -364,18 +380,11 @@ protected function loadQuadsIntoEasyRdfGraph(
|| str_contains($th->getMessage(), 'on line')
) {
echo PHP_EOL.' - quickRdfIo failed, trying rapper'.PHP_EOL;
/*
* use rapper command to read the RDF file and return nquads
*/
if (isEmpty($format)) {
// FYI: https://librdf.org/raptor/rapper.html
$format = '--guess';
} else {
$format = '-i '.substr((string) $format, 0, 20);
}

$command = 'rapper '.$format.' -o ntriples '.$rdfFileUrl;
// build and execute command using system shell
$command = 'rapper --guess -o ntriples '.$localFilePath;
$nquads = (string) shell_exec($command);

// limit amount of entries
$triples = explode(PHP_EOL, $nquads);
$triples = array_slice($triples, 0, $maxAmountOfTriples);
Expand All @@ -386,4 +395,16 @@ protected function loadQuadsIntoEasyRdfGraph(
}
}
}

/**
 * Tells whether the given graph contains at least one resource typed as
 * owl:Ontology, owl:Class or rdfs:Class.
 *
 * The types are checked in that order and the check stops at the first hit.
 */
protected function ontologyFileContainsElementsOfCertainTypes(Graph $graph): bool
{
    foreach (['owl:Ontology', 'owl:Class', 'rdfs:Class'] as $type) {
        if (0 < count($graph->allOfType($type))) {
            return true;
        }
    }

    return false;
}
}
Loading

0 comments on commit fc3ab7b

Please sign in to comment.