|
| 1 | +from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder |
| 2 | +from shexer.io.graph.yielder.multifile_base_triples_yielder import MultifileBaseTripleYielder |
| 3 | +from shexer.utils.triple_yielders import tune_subj, tune_prop, tune_token |
| 4 | +from shexer.utils.uri import remove_corners |
| 5 | +import tempfile |
| 6 | +from pathlib import Path |
| 7 | +import re |
| 8 | +from shexer.utils.exception import ParseError |
| 9 | + |
| 10 | +WHITESPACES= re.compile(r"\s+") |
| 11 | + |
| 12 | +import lightrdf |
| 13 | + |
| 14 | + |
class LightTurtleTriplesYielder(BaseTriplesYielder):
    """Yield the triples of a Turtle document parsed with ``lightrdf``.

    The input is either a file path (``source_file``) or the Turtle text
    itself (``raw_graph``). When ``raw_graph`` is given it is written to a
    temporary file first, because the parser is fed a file path.

    Prefix declarations found at the top of the document are recorded in
    ``namespaces_dict`` as ``{namespace_iri: prefix}``. The dict is mutated
    in place, so a caller that passes its own dict sees the new entries.
    """

    # A prefix directive: Turtle ``@prefix ex: <iri> .`` or the SPARQL-style
    # ``PREFIX ex: <iri>`` (keyword case-insensitive, per Turtle 1.1).
    # Group 1 is the prefix label (may be empty), group 2 the IRI in <...>.
    _PREFIX_DIRECTIVE = re.compile(
        r"^(?:@prefix|(?i:prefix))\s+([^\s:]*):\s*(<[^>]*>)"
    )

    # Any directive line that may legitimately appear in the document header.
    _HEADER_DIRECTIVE = re.compile(r"^(?:@prefix|@base|(?i:prefix|base)\s)")

    def __init__(self, source_file, raw_graph, namespaces_dict):
        """Store the input description.

        :param source_file: path of the Turtle file to parse; only used when
            ``raw_graph`` is None.
        :param raw_graph: Turtle content as a string, or None.
        :param namespaces_dict: dict updated in place with the prefixes found
            in the document; a fresh dict is used when None.
        """
        super().__init__()
        self._prefixes = {}
        self._source_file = source_file
        self._raw_graph = raw_graph
        self._namespaces_dict = namespaces_dict if namespaces_dict is not None else {}
        self._yielded_triples = 0

    def _yield_triples(self):
        """Parse ``self._source_file`` and yield its triples one by one.

        Each term goes through the shexer ``tune_*`` helpers before being
        yielded.

        :raises ParseError: when the parser fails on the document.
        """
        self._extract_prefixes()
        parser = lightrdf.turtle.Parser()
        try:
            for s, p, o in parser.parse(self._source_file, base_iri=None):
                yield (
                    tune_subj(s),
                    tune_prop(p),
                    tune_token(o)
                )
                self._yielded_triples += 1
        except (GeneratorExit, KeyboardInterrupt, SystemExit):
            # These are not parsing failures: the consumer stopped iterating
            # or the process is being interrupted. Let them through untouched
            # instead of disguising them as a ParseError.
            raise
        except BaseException as e:
            # NOTE(review): BaseException is kept on purpose, presumably
            # because the native parser can raise errors that do not derive
            # from Exception - confirm against lightrdf.
            raise ParseError(f"Error while parsing: {e}") from e

    def yield_triples(self):
        """Yield every triple of the configured input.

        When a raw graph was provided, it is dumped to a temporary file that
        only lives for the duration of the iteration.
        """
        if self._raw_graph is None:
            yield from self._yield_triples()
            return

        original_source = self._source_file
        with tempfile.TemporaryDirectory() as tmpdir:
            # The Turtle parser is instantiated explicitly in _yield_triples,
            # so it is not selected from this file name.
            tmp_file = Path(tmpdir) / "data.nt"
            tmp_file.write_text(self._raw_graph, encoding="utf-8")
            self._source_file = str(tmp_file)
            try:
                yield from self._yield_triples()
            finally:
                # Do not leave the instance pointing at a file that is about
                # to be deleted together with the temporary directory.
                self._source_file = original_source

    @property
    def yielded_triples(self):
        """Number of triples yielded so far."""
        return self._yielded_triples

    @property
    def error_triples(self):  # No error triples in this parser, it crashes when finding an error
        """Always 0: a malformed document aborts the parsing instead."""
        return 0

    def _extract_prefixes(self):
        """Collect the prefix declarations found in the document header.

        Only the header is scanned: blank lines, ``#`` comments and base
        directives are skipped, and the scan stops at the first line that is
        anything else. Prefixes declared after that point are not collected.
        """
        with open(self._source_file, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = WHITESPACES.sub(" ", raw_line.strip())
                if not line or line.startswith("#"):
                    continue  # blank or comment line, still in the header
                if self._HEADER_DIRECTIVE.match(line) is None:
                    break  # first line of actual content, header is over
                self._process_prefix_line(line)

    def _process_prefix_line(self, line):
        """Register the prefix declared in ``line``, if there is one.

        Lines that do not hold a complete prefix declaration (for instance a
        base directive, or a declaration split across lines) are ignored; a
        real syntax problem is left for the parser to report. A namespace
        already present in the dict keeps its existing prefix.

        :param line: a stripped header line.
        """
        declaration = self._PREFIX_DIRECTIVE.match(line)
        if declaration is None:
            return
        prefix = declaration.group(1)
        base_url = remove_corners(declaration.group(2))
        if base_url not in self._namespaces_dict:
            self._namespaces_dict[base_url] = prefix
| 75 | + |
class MultiLightTurtleTriplesYielder(MultifileBaseTripleYielder):
    """Multi-file yielder that builds one LightTurtleTriplesYielder per file.

    All the per-file yielders receive the same ``namespaces_dict`` object.
    """

    def __init__(self, list_of_files, namespaces_dict):
        """Configure the multi-file base and keep the namespaces dict.

        :param list_of_files: files handed to the multi-file base class.
        :param namespaces_dict: dict passed to every per-file yielder.
        """
        # Options of the base class that this yielder always leaves disabled.
        fixed_base_options = {
            "namespaces_to_ignore": None,
            "allow_untyped_numbers": False,
            "compression_mode": None,
            "zip_base_archive": None,
        }
        super().__init__(list_of_files=list_of_files, **fixed_base_options)
        self._namespaces_dict = namespaces_dict

    def _constructor_file_yielder(self, a_source_file, parse_namespaces=False):
        """Build the yielder in charge of a single file.

        ``parse_namespaces`` is accepted but not used by this implementation.
        """
        single_file_yielder = LightTurtleTriplesYielder(
            source_file=a_source_file,
            raw_graph=None,
            namespaces_dict=self._namespaces_dict,
        )
        return single_file_yielder
| 90 | + |
0 commit comments