From c275ab1f1a02b942d620e630024ced2cd3a51efd Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Mon, 1 Jul 2024 12:39:08 +0200 Subject: [PATCH 1/5] update dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3fb8e34..bc75693 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,5 +22,5 @@ RUN poetry config virtualenvs.create false RUN poetry install --no-dev && rm pyproject.toml -CMD python3 -m proxy --hostname 0.0.0.0 --port $PORT --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin +CMD python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname 0.0.0.0 --port $PORT --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin From 3658201a2c603e74a4d09ffaae0f796b12bbad5c Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 2 Jul 2024 14:20:54 +0200 Subject: [PATCH 2/5] fixing comments from Johannes --- README.md | 133 +--------------------------- ontologytimemachine/custom_proxy.py | 47 +++------- ontologytimemachine/utils/utils.py | 107 +++++----------------- tests/test_proxy.py | 107 ++++++++++++---------- 4 files changed, 93 insertions(+), 301 deletions(-) diff --git a/README.md b/README.md index 51247f5..8a33f66 100644 --- a/README.md +++ b/README.md @@ -22,142 +22,15 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem ### Curl tests: - curl -x http://0.0.0.0:8899 --cacert ca-cert.pem http://www.google.com -- curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://linked-web-apis.fit.cvut.cz/ns/core - curl -x http://0.0.0.0:8899 --cacert ca-cert.pem https://www.w3id.org/simulation/ontology/ - curl -x http://0.0.0.0:8899 --cacert ca-cert.pem https://www.w3.org/ns/ldt# - curl -x http://0.0.0.0:8899 --cacert ca-cert.pem https://raw.githubusercontent.com/br0ast/simulationontology/main/Ontology/simulationontology.owl - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://bblfish.net/work/atom-owl/2006-06-06/ - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://purl.org/makolab/caont/ - - -### Not working: - curl -x http://0.0.0.0:8899 --cacert ca-cert.pem https://vocab.eccenca.com/auth/ +- curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://dbpedia.org/ontology/Person - - -from proxy.http.proxy import HttpProxyBasePlugin -from proxy.http.parser import HttpParser, httpParserTypes -from proxy.common.utils import build_http_response -from proxy.http.methods import HttpMethods -from ontologytimemachine.utils.utils import proxy_logic_http, proxy_logic_https -from ontologytimemachine.utils.utils import check_if_archivo_ontology_requested -from ontologytimemachine.utils.utils import get_headers_and_expected_type -from requests.exceptions import SSLError, Timeout, ConnectionError, RequestException -from http.client import responses -import proxy -import sys -import requests -import logging - - -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - - -class OntologyTimeMachinePlugin(HttpProxyBasePlugin): - #def __init__(self, *args, **kwargs): - # super().__init__(*args, **kwargs) - - - def before_upstream_connection(self, request: HttpParser): - logger.debug('Before upstream') - print(request.method) - scheme = 'https' 
if request.method == b'CONNECT' else 'http' - if scheme == 'https': - logger.debug('The request is HTTPS, forward as it is') - return request - - ontology_request = check_if_archivo_ontology_requested(request) - if ontology_request: - logger.debug('The request is for an ontology') - try: - ontology_url = str(request._url) - headers, _ = get_headers_and_expected_type(request) - response = requests.get(ontology_url, headers=headers) - if response.status_code == 502: - logger.error('Received 502 Bad Gateway error') - response = proxy_logic_http(request) - logger.debug('Queue response') - self.queue_response(response) - return None - else: - logger.debug('The request is correct') - return request - except (SSLError, Timeout, ConnectionError, RequestException) as e: - logger.error(f'Network-related exception occurred {e}') - response = proxy_logic_http(request) - logger.debug('Queue response') - self.queue_response(response) - return None - return request - - - def handle_client_request(self, request: HttpParser): - logger.debug('HTTP call') - logger.debug(request._url) - - ontology_request = check_if_archivo_ontology_requested(request) - if not ontology_request: - logger.info('No ontology is asked, forward original request') - return request - - response = proxy_logic_http(request) - self.queue_response(response) - - return None - - - def handle_upstream_chunk(self, chunk: memoryview): - logger.info('HTTPS call') - - try: - # Parse the HTTP response to handle different cases - parser = HttpParser(httpParserTypes.RESPONSE_PARSER) - parser.parse(memoryview(chunk)) - code = int(parser.code.decode('utf-8')) - if code >= 100 and code < 200: - return chunk - elif code >= 201 and code <= 204: - return chunk - elif code == 451: - return chunk - else: - response = proxy_logic_https(parser) - logger.debug('Queue response') - self.queue_response(response) - return None - except UnicodeDecodeError: - logger.warning('Received non-text chunk, cannot decode') - except Exception as e: - logger.error(f'Exception occurred while handling upstream chunk: {e}') - - return chunk - - def queue_response(self, response): - self.client.queue( - build_http_response( - response.status_code, - reason=bytes(responses[response.status_code], 'utf-8'), - headers={ - b'Content-Type': bytes(response.headers.get('Content-Type'), 'utf-8') - }, - body=response.content - ) - ) - - -if __name__ == '__main__': - sys.argv += [ - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', - ] - sys.argv += [ - '--hostname', '0.0.0.0', - '--port', '8899', - '--plugins', __name__ + '.OntologyTimeMachinePlugin' - ] - logger.info("Starting OntologyTimeMachineProxy server...") - proxy.main() \ No newline at end of file +### Not working: +- curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index c3ee1f4..28b52f0 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -2,7 +2,7 @@ from proxy.http.parser import HttpParser, httpParserTypes from proxy.common.utils import build_http_response from proxy.http.methods import HttpMethods -from ontologytimemachine.utils.utils import proxy_logic_http, proxy_logic_https +from ontologytimemachine.utils.utils import proxy_logic from ontologytimemachine.utils.utils import check_if_archivo_ontology_requested from ontologytimemachine.utils.utils import get_headers_and_expected_type 
from ontologytimemachine.utils.utils import get_ontology_from_request @@ -18,74 +18,51 @@ PORT = '8899' -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) - class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def before_upstream_connection(self, request: HttpParser): - logger.debug('Before upstream') - logger.debug(request.method) + logger.info('Before upstream connection hook') + logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + scheme = 'https' if request.method == b'CONNECT' else 'http' if scheme == 'https': - logger.debug('The request is HTTPS, forward as it is') - logger.debug(f'Request host: {request.host}') - logger.debug(f'Request path: {request.path}') return request ontology_request = check_if_archivo_ontology_requested(request) if ontology_request: logger.debug('The request is for an ontology') - try: - ontology_url = str(request._url) - headers, _ = get_headers_and_expected_type(request) - response = requests.get(ontology_url, headers=headers, timeout=5) - if response.status_code == 502: - logger.error('Received 502 Bad Gateway error') - response = proxy_logic_http(request) - logger.debug('Queue response') - self.queue_response(response) - return None - else: - logger.debug('The request is correct') - return request - except (SSLError, Timeout, ConnectionError, RequestException) as e: - logger.error(f'Network-related exception occurred {e}') - response = proxy_logic_http(request) - logger.debug('Queue response') - self.queue_response(response) - return None + response = proxy_logic(request) + self.queue_response(response) + return None return request def handle_client_request(self, request: HttpParser): - logger.debug('HTTP call') + logger.info('Handle client request hook') + logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') logger.debug(request.method) scheme = 'https' if request.method == b'CONNECT' else 'http' if scheme == 'https': - logger.debug('The request is HTTPS, forward as it is') return request ontology_request = check_if_archivo_ontology_requested(request) if not ontology_request: - logger.info('No ontology is asked, forward original request') + logger.info('The requested IRI is not part of DBpedia Archivo') return request - logger.debug('Call proxy logic') - response = proxy_logic_http(request) + response = proxy_logic(request) self.queue_response(response) return None def handle_upstream_chunk(self, chunk: memoryview): - logger.info('HTTPS call') - return chunk diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index db05205..05066a2 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -6,37 +6,13 @@ import requests -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) dbpedia_api = 'https://archivo.dbpedia.org/download' -ontology_types = [ - 'application/turtle', # Turtle (Terse RDF Triple Language) - 'text/turtle', # Turtle (alternative media type) - 'application/rdf+xml', # RDF/XML - 
'text/rdf+xml', # RDF/XML (alternative media type) - 'application/n-triples', # N-Triples - 'text/n-triples', # N-Triples (alternative media type) - 'application/n-quads', # N-Quads - 'text/n-quads', # N-Quads (alternative media type) - 'application/ld+json', # JSON-LD (JSON for Linking Data) - 'application/trig', # TriG - 'application/sparql-results+json', # SPARQL Query Results JSON - 'application/json', # JSON (alternative for SPARQL Query Results JSON) - 'application/sparql-results+xml', # SPARQL Query Results XML - 'text/xml' # XML (alternative for SPARQL Query Results XML) -] - - -format_mapping = { - 'application/turtle': 'ttl', # Turtle (Terse RDF Triple Language) - 'text/turtle': 'ttl', # Turtle (alternative media type) -} - - passthrough_status_codes_http = [ 100, 101, 102, 103, 200, @@ -46,12 +22,15 @@ def check_if_archivo_ontology_requested(request): - urls = [] with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: urls = [line.strip() for line in file] - parsed_urls = [urlparse(url).netloc for url in urls] - _, host = get_ontology_from_request(request) - return host in parsed_urls + parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] + + _, request_host, request_path = get_ontology_from_request(request) + for host, path in parsed_urls: + if request_host == host and request_path.startswith(path): + return True + return False def mock_response_404(): @@ -65,12 +44,9 @@ def mock_response_404(): def get_headers_and_expected_type(request): headers = {} - expected_type = 'text/turtle' for k, v in request.headers.items(): headers[v[0].decode('utf-8')] = v[1].decode('utf-8') - if v[0].decode('utf-8') == 'Accept': - expected_type = v[1].decode('utf-8') - return headers, expected_type + return headers def get_ontology_from_request(request): @@ -80,28 +56,28 @@ def get_ontology_from_request(request): if v[0].decode('utf-8') == 'Host': print('host found') host = v[1].decode('utf-8') + path = request.path.decode('utf-8') ontology = 'https://' + host + request.path.decode('utf-8') else: host = request.host.decode('utf-8') + path = request.path.decode('utf-8') ontology = str(request._url) logger.debug(f'Ontology: {ontology}') - return ontology, host + return ontology, host, path -def proxy_logic_http(request: HttpParser): - logger.info('Start proxy logic in case of HTTP') - response = failover_mode_http(request) +def proxy_logic(request: HttpParser): + logger.info('Proxy has to intervene') + response = failover_mode(request) return response -def failover_mode_http(request): - headers, _ = get_headers_and_expected_type(request) - logger.debug(headers) - logger.debug('Failover mode') +def failover_mode(request): + headers = get_headers_and_expected_type(request) + logger.info('Failover mode') - ontology, _ = get_ontology_from_request(request) + ontology, _, _ = get_ontology_from_request(request) try: response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info(f' Status code: {response.status_code}') if response.history: logger.debug("Request was redirected") for resp in response.history: @@ -111,54 +87,12 @@ def failover_mode_http(request): logger.debug("Request was not redirected") content_type = response.headers.get('Content-Type') logger.debug(content_type) - if response.status_code in passthrough_status_codes_http and content_type in ontology_types: - logging.info(f'We found the right answer') - return response - else: - logging.info('Content type is not as expected') - return fetch_from_dbpedia_archivo_api(ontology) - except 
(SSLError, Timeout, ConnectionError, RequestException) as e: - #logger.error("Request failed:", e) - return fetch_from_dbpedia_archivo_api(ontology) - - -def proxy_logic_https(parser): - logger.info('Start proxy logic in case of HTTPS') - response = failover_mode_https(parser) - return response - - -def failover_mode_https(parser): - code = int(parser.code.decode('utf-8')) - if code >= 300 and code < 400: - logger.info('Status code: 3XX') - # Handle redirection - redirect_url = parser.header(b'Location').decode('utf-8') - logger.info(f'Redirected to {redirect_url}') - response = get_data_from_redirect(redirect_url) - return response - - -def get_data_from_redirect(ontology): - try: - response = requests.get(url=ontology, timeout=5) - logger.info(response.status_code) - if response.history: - logger.debug("Request was redirected") - for resp in response.history: - logger.debug(f"{resp.status_code}, {resp.url}") - logger.debug(f"Final destination: {response.status_code}, {response.url}") - else: - logger.debug("Request was not redirected") - content_type = response.headers.get('Content-Type') if response.status_code in passthrough_status_codes_http: - logging.info(f'We found the rights answer') return response else: - logging.info('Content type is not as expected') + logging.info(f'Status code: {response.status_code}') return fetch_from_dbpedia_archivo_api(ontology) except (SSLError, Timeout, ConnectionError, RequestException) as e: - #logger.error("Request failed:", e) return fetch_from_dbpedia_archivo_api(ontology) @@ -167,7 +101,6 @@ def fetch_from_dbpedia_archivo_api(ontology: str, format: str = 'ttl'): try: logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') response = requests.get(dbpedia_url, timeout=5) - logger.debug('Response received') return response except requests.exceptions.RequestException as e: logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') diff --git a/tests/test_proxy.py b/tests/test_proxy.py index d370ac1..34dde2f 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -42,79 +42,84 @@ def start_proxy_server(): process.wait() -#def test_babelnet(): -# iri = 'http://babelnet.org/rdf/' -# generic_test(iri, 'text/turtle') +def test_babelnet(): + iri = 'http://babelnet.org/rdf/' + generic_test(iri, 'text/turtle') -#def test_bag_basisregistraties(): -# iri = 'http://bag.basisregistraties.overheid.nl/def/bag' -# generic_test(iri, 'text/turtle') +def test_bag_basisregistraties(): + iri = 'http://bag.basisregistraties.overheid.nl/def/bag' + generic_test(iri, 'text/turtle') -#def test_bblfish(): -# iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' -# generic_test(iri, 'text/turtle') +def test_bblfish(): + iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' + generic_test(iri, 'text/turtle') -#def test_brk_basisregistraties(): -# iri = 'http://brk.basisregistraties.overheid.nl/def/brk' -# generic_test(iri, 'text/turtle') +def test_brk_basisregistraties(): + iri = 'http://brk.basisregistraties.overheid.nl/def/brk' + generic_test(iri, 'text/turtle') -#def test_brt_basisregistraties(): -# iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' -# generic_test(iri, 'text/turtle') +def test_brt_basisregistraties(): + iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' + generic_test(iri, 'text/turtle') -#def test_brt_basisregistraties_begrippenkader(): -# iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' -# generic_test(iri, 'text/turtle') +def test_brt_basisregistraties_begrippenkader(): + 
iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' + generic_test(iri, 'text/turtle') -#def test_buzzword(): -# iri = 'http://buzzword.org.uk/rdf/personal-link-types#' -# generic_test(iri, 'text/turtle') +def test_buzzword(): + iri = 'http://buzzword.org.uk/rdf/personal-link-types#' + generic_test(iri, 'text/turtle') -#def test_catalogus_professorum(): -# iri = 'http://catalogus-professorum.org/cpm/2/' -# generic_test(iri, 'text/turtle') +def test_catalogus_professorum(): + iri = 'http://catalogus-professorum.org/cpm/2/' + generic_test(iri, 'text/turtle') -#def test_data_gov(): -# iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' -# generic_test(iri, 'text/turtle') +def test_data_gov(): + iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' + generic_test(iri, 'text/turtle') -#def test_data_bigdatagrapes(): -# iri = 'http://data.bigdatagrapes.eu/resource/ontology/' -# generic_test(iri, 'text/turtle') +def test_data_bigdatagrapes(): + iri = 'http://data.bigdatagrapes.eu/resource/ontology/' + generic_test(iri, 'text/turtle') -#def test_data_europa_esco(): -# iri = 'http://data.europa.eu/esco/flow' -# generic_test(iri, 'text/turtle') +def test_data_europa_esco(): + iri = 'http://data.europa.eu/esco/flow' + generic_test(iri, 'text/turtle') -#def test_data_globalchange(): -# iri = 'http://data.globalchange.gov/gcis.owl' -# generic_test(iri, 'text/turtle') +def test_data_globalchange(): + iri = 'http://data.globalchange.gov/gcis.owl' + generic_test(iri, 'text/turtle') -#def test_data_ontotext(): -# iri = 'http://data.ontotext.com/resource/leak/' -# generic_test(iri, 'text/turtle') +def test_data_ontotext(): + iri = 'http://data.ontotext.com/resource/leak/' + generic_test(iri, 'text/turtle') -#def test_data_opendiscoveryspace(): -# iri = 'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' -# generic_test(iri, 'text/turtle') +def test_data_opendiscoveryspace(): + iri = 'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' + generic_test(iri, 'text/turtle') -#def test_data_ordnancesurvey_50kGazetteer(): -# iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' -# generic_test(iri, 'text/turtle') +def test_data_ordnancesurvey_50kGazetteer(): + iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' + generic_test(iri, 'text/turtle') + + +def test_data_ordnancesurvey_50kGazetteer(): + iri = 'http://dbpedia.org/ontology/Person' + generic_test(iri, 'text/turtle') def test_linked_web_apis(): @@ -122,9 +127,9 @@ def test_linked_web_apis(): generic_test(iri, 'text/turtle') -def test_ontologi_es(): - iri = 'http://ontologi.es/days#' - generic_test(iri, 'text/turtle') +#def test_ontologi_es(): +# iri = 'http://ontologi.es/days#' +# generic_test(iri, 'text/turtle') def test_https(): @@ -132,6 +137,11 @@ def test_https(): generic_test(iri, 'text/plain; charset=utf-8') +def test_https(): + iri = "https://vocab.eccenca.com/auth/" + generic_test(iri, 'text/plain; charset=utf-8') + + def not_test_all_iris(): with open('tests/archivo_ontologies_test.txt', 'r') as file: for line in file: @@ -143,7 +153,6 @@ def not_test_all_iris(): def generic_test(iri, content_type): response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 - assert response.headers['Content-Type'] == content_type assert iri in response.content.decode('utf-8') From 2b13d78b2eee004594ef9e17abd16db14b0048bf Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 5 Jul 2024 12:49:10 +0200 Subject: [PATCH 3/5] adjust the proxy logic 
based on the canvas --- README.md | 3 + ontologytimemachine/custom_proxy.py | 35 ++- ontologytimemachine/utils/mock_responses.py | 38 ++++ ontologytimemachine/utils/utils.py | 234 ++++++++++++++++++-- 4 files changed, 279 insertions(+), 31 deletions(-) create mode 100644 ontologytimemachine/utils/mock_responses.py diff --git a/README.md b/README.md index 8a33f66..09c2ac9 100644 --- a/README.md +++ b/README.md @@ -34,3 +34,6 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem ### Not working: - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# + + +# https://archivo.dbpedia.org/download?o=http://linked-web-apis.fit.cvut.cz/ns/core&v=2020.07.16-115638&versionMatching=timeStampClosest \ No newline at end of file diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 28b52f0..95cf79f 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -2,15 +2,13 @@ from proxy.http.parser import HttpParser, httpParserTypes from proxy.common.utils import build_http_response from proxy.http.methods import HttpMethods -from ontologytimemachine.utils.utils import proxy_logic +from ontologytimemachine.utils.utils import proxy_logic, parse_arguments from ontologytimemachine.utils.utils import check_if_archivo_ontology_requested -from ontologytimemachine.utils.utils import get_headers_and_expected_type -from ontologytimemachine.utils.utils import get_ontology_from_request +from ontologytimemachine.utils.mock_responses import mock_response_403 from requests.exceptions import SSLError, Timeout, ConnectionError, RequestException from http.client import responses import proxy import sys -import requests import logging @@ -24,19 +22,34 @@ class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + (self.ontoFormat, self.ontoVersion, self.only_ontologies, + self.https_intercept, self.inspect_redirects, self.forward_headers, + self.subject_binary_search_threshold) = parse_arguments() + def before_upstream_connection(self, request: HttpParser): logger.info('Before upstream connection hook') logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') - scheme = 'https' if request.method == b'CONNECT' else 'http' - if scheme == 'https': - return request + if request.method == b'CONNECT': + logger.info(f'HTTPS interception mode: {self.https_intercept}') + # Only intercept if interception is enabled + if self.https_intercept in ['all', 'archivo']: + return request + else: + return None + ontology_request = check_if_archivo_ontology_requested(request) + # If only ontology mode, return None in all other cases + if self.only_ontologies and not ontology_request: + logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') + self.queue_response(mock_response_403) + return None + if ontology_request: logger.debug('The request is for an ontology') - response = proxy_logic(request) + response = proxy_logic(request, self.ontoFormat, self.ontoVersion) self.queue_response(response) return None return request @@ -47,8 +60,7 @@ def handle_client_request(self, request: HttpParser): logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') logger.debug(request.method) - scheme = 'https' if request.method == b'CONNECT' else 
'http' - if scheme == 'https': + if request.method == b'CONNECT': return request ontology_request = check_if_archivo_ontology_requested(request) @@ -56,7 +68,7 @@ def handle_client_request(self, request: HttpParser): logger.info('The requested IRI is not part of DBpedia Archivo') return request - response = proxy_logic(request) + response = proxy_logic(request, self.ontoFormat, self.ontoVersion) self.queue_response(response) return None @@ -80,6 +92,7 @@ def queue_response(self, response): if __name__ == '__main__': + sys.argv += [ '--ca-key-file', 'ca-key.pem', '--ca-cert-file', 'ca-cert.pem', diff --git a/ontologytimemachine/utils/mock_responses.py b/ontologytimemachine/utils/mock_responses.py new file mode 100644 index 0000000..d6b0e11 --- /dev/null +++ b/ontologytimemachine/utils/mock_responses.py @@ -0,0 +1,38 @@ +import requests + + +def mock_response_200(): + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response.url = 'https://example.com/success' + mock_response.headers['Content-Type'] = 'text/html' + mock_response._content = b'
<html><body><h1>To be implemented</h1></body></html>
' + return mock_response + + +def mock_response_403(): + mock_response = requests.Response() + mock_response.status_code = 403 + mock_response.url = 'https://example.com/forbidden' + mock_response.headers['Content-Type'] = 'text/html' + mock_response._content = b'
<html><body><h1>403 Forbidden</h1></body></html>
' + return mock_response + + + +def mock_response_404(): + mock_response = requests.Response() + mock_response.status_code = 404 + mock_response.url = 'https://example.com/resource-not-found' + mock_response.headers['Content-Type'] = 'text/html' + mock_response._content = b'
<html><body><h1>404 Not Found</h1></body></html>
' + return mock_response + + +def mock_response_500(): + mock_response = requests.Response() + mock_response.status_code = 500 + mock_response.url = 'https://example.com/internal-server-error' + mock_response.headers['Content-Type'] = 'text/html' + mock_response._content = b'
<html><body><h1>500 Internal Server Error</h1></body></html>
' + return mock_response \ No newline at end of file diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 05066a2..227b8eb 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -1,9 +1,12 @@ from proxy.http.parser import HttpParser, httpParserTypes from requests.exceptions import SSLError, Timeout, ConnectionError, RequestException +from ontologytimemachine.utils.mock_responses import mock_response_403, mock_response_404, mock_response_500, mock_response_200 from http.client import responses from urllib.parse import urlparse import logging import requests +import argparse +import mimetypes logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -20,6 +23,80 @@ 451, ] +def parse_arguments(): + parser = argparse.ArgumentParser(description='Process ontology format and version.') + + # Defining ontoFormat argument with nested options + parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], + default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') + + parser.add_argument('--ontoPrecedence', type=str, choices=['default', 'enforcedPriority', 'always'], + default='enforcedPriority', help='Precedence of the ontology: default, enforcedPriority, always') + + parser.add_argument('--patchAcceptUpstream', type=bool, default=False, + help='Defines if the Accept Header is patched upstream in original mode.') + + # Defining ontoVersion argument + parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', + 'latestArchive', 'timestampArchive', 'dependencyManifest'], + default='originalFailoverLive', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') + + # Enable/disable mode to only proxy requests to ontologies + parser.add_argument('--onlyOntologies', type=bool, default=False, + help='Enable/disable mode to only proxy requests to ontologies.') + + # Enable HTTPS interception for specific domains + parser.add_argument('--httpsIntercept', type=str, choices=['none', 'archivo', 'all', 'listfilename'], + default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') + + # Enable/disable inspecting or removing redirects + parser.add_argument('--inspectRedirects', type=bool, default=True, + help='Enable/disable inspecting or removing redirects.') + + # Enable/disable proxy forward headers + parser.add_argument('--forwardHeaders', type=bool, default=True, + help='Enable/disable proxy forward headers.') + + # SubjectBinarySearchThreshold + parser.add_argument('--subjectBinarySearchThreshold', type=int, default=100, + help='SubjectBinarySearchThreshold value.') + + # Proxy native parameters + parser.add_argument('--ca-key-file', type=str, required=True, + help='Path to the CA key file.') + + parser.add_argument('--ca-cert-file', type=str, required=True, + help='Path to the CA certificate file.') + + parser.add_argument('--ca-signing-key-file', type=str, required=True, + help='Path to the CA signing key file.') + + parser.add_argument('--hostname', type=str, required=True, + help='Hostname for the proxy server.') + + parser.add_argument('--port', type=int, required=True, + help='Port for the proxy server.') + + parser.add_argument('--plugins', type=str, required=True, + help='Plugins for the proxy server.') + + args = 
parser.parse_args() + + ontoFormat = { + 'format': args.ontoFormat, + 'precedence': args.ontoPrecedence, + 'patchAcceptUpstream': args.patchAcceptUpstream + } + + logger.info(f'Ontology Format: {ontoFormat}') + logger.info(f'Ontology Version: {args.ontoVersion}') + #logger.info(f'Only Ontologies Mode: {args.onlyOntologies}') + #logger.info(f'HTTPS Interception: {args.httpsIntercept}') + #logger.info(f'Inspect Redirects: {args.inspectRedirects}') + #logger.info(f'Forward Headers: {args.forwardHeaders}') + #logger.info(f'Subject Binary Search Threshold: {args.subjectBinarySearchThreshold}') + return ontoFormat, args.ontoVersion, args.onlyOntologies, args.httpsIntercept, args.inspectRedirects, args.forwardHeaders, args.subjectBinarySearchThreshold + def check_if_archivo_ontology_requested(request): with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: @@ -33,16 +110,7 @@ def check_if_archivo_ontology_requested(request): return False -def mock_response_404(): - mock_response = requests.Response() - mock_response.status_code = 404 - mock_response.url = 'https://example.com/resource-not-found' - mock_response.headers['Content-Type'] = 'text/html' - mock_response._content = b'
<html><body><h1>404 Not Found</h1></body></html>
' - return mock_response - - -def get_headers_and_expected_type(request): +def get_headers(request): headers = {} for k, v in request.headers.items(): headers[v[0].decode('utf-8')] = v[1].decode('utf-8') @@ -50,11 +118,10 @@ def get_headers_and_expected_type(request): def get_ontology_from_request(request): - logger.debug('Get ontology from request') + logger.info('Get ontology from request') if (request.method == b'GET' or request.method == b'HEAD') and not request.host: for k, v in request.headers.items(): if v[0].decode('utf-8') == 'Host': - print('host found') host = v[1].decode('utf-8') path = request.path.decode('utf-8') ontology = 'https://' + host + request.path.decode('utf-8') @@ -62,17 +129,115 @@ def get_ontology_from_request(request): host = request.host.decode('utf-8') path = request.path.decode('utf-8') ontology = str(request._url) - logger.debug(f'Ontology: {ontology}') + logger.info(f'Ontology: {ontology}') return ontology, host, path -def proxy_logic(request: HttpParser): + +def get_mime_type(format): + # Guess the MIME type based on the format + mime_type, _ = mimetypes.guess_type(f'file.{format}') + # Return the guessed MIME type or a generic default if guessing fails + return mime_type or 'text/turtle' + + +def set_onto_format_headers(request, ontoFormat, ontoVersion): + logger.info(f'Setting headers based on ontoFormat: {ontoFormat}') + + # Determine the correct MIME type for the format + mime_type = get_mime_type(ontoFormat['format']) + + # Check the precedence and update the 'Accept' header if necessary + if ontoFormat['precedence'] in ['always', 'enforcedPriority'] or \ + (ontoFormat['precedence'] == 'default' and b'accept' not in request.headers): + request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) + logger.info(f'Accept header set to: {request.headers[b"accept"][1]}') + + # Check if patchAcceptUpstream is true and ontoVersion is 'original' + if ontoFormat['patchAcceptUpstream'] and ontoVersion == 'original': + request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) + logger.info(f'Accept header patched upstream: {request.headers[b"accept"][1]}') + + +def proxy_logic(request: HttpParser, ontoFormat, ontoVersion): logger.info('Proxy has to intervene') - response = failover_mode(request) + set_onto_format_headers(request, ontoFormat, ontoVersion) + headers = get_headers(request) + logger.info(f'Updated headers: {request.headers}') + ontology, _, _ = get_ontology_from_request(request) + if ontoVersion == 'original': + response = fetch_original(ontology, headers) + elif ontoVersion == 'originalFailoverLive': + response = fetch_failover(ontology, headers, live=True) + elif ontoVersion == 'originalFailoverMonitor': + response = fetch_failover(ontology, headers, monitor=True) + elif ontoVersion == 'latestArchive': + response = fetch_latest_archive(ontology, headers) + elif ontoVersion == 'timestampArchive': + response = fetch_timestamp_archive(ontology, headers) + elif ontoVersion == 'dependencyManifest': + response = fetch_dependency_manifest(ontology, headers) + return response +# Fetch from the original source, no matter what +def fetch_original(ontology, headers): + logger.info(f'Fetching original ontology from URL: {ontology}') + try: + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched original ontology') + return response + except Exception as e: + logger.error(f'Error fetching original ontology: {e}') + return mock_response_500() + + +# Failover mode +def fetch_failover(ontology, 
headers, live=False, monitor=False): + try: + logger.info(f'Fetching original ontology with failover from URL: {ontology}') + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched original ontology') + if response.status_code in passthrough_status_codes_http: + return response + else: + logging.info(f'Status code: {response.status_code}') + return fetch_from_dbpedia_archivo_api(ontology, headers) + except Exception as e: + logger.error(f'Error fetching original ontology: {e}') + if live: + logger.info('Attempting to fetch live version due to failover') + return fetch_from_dbpedia_archivo_api(ontology, headers) + elif monitor: + logger.info('Attempting to fetch archive monitor version due to failover') + # TODO + return mock_response_404 + else: + return mock_response_500 + + +# Fetch the lates version from archivo (no timestamp defined) +def fetch_latest_archive(ontology, headers): + logger.info(f'Fetching latest archive ontology from URL: {ontology}/latest') + try: + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched latest archive ontology') + return response + except Exception as e: + logger.error(f'Error fetching latest archive ontology: {e}') + return mock_response_500 + + +def fetch_timestamp_archive(ontology, headers): + return mock_response_404 + + +def fetch_dependency_manifest(ontology, headers): + return mock_response_404 + + def failover_mode(request): - headers = get_headers_and_expected_type(request) + headers = get_headers(request) logger.info('Failover mode') ontology, _, _ = get_ontology_from_request(request) @@ -91,12 +256,13 @@ def failover_mode(request): return response else: logging.info(f'Status code: {response.status_code}') - return fetch_from_dbpedia_archivo_api(ontology) + return fetch_from_dbpedia_archivo_api(ontology, headers) except (SSLError, Timeout, ConnectionError, RequestException) as e: - return fetch_from_dbpedia_archivo_api(ontology) + return fetch_from_dbpedia_archivo_api(ontology, headers) -def fetch_from_dbpedia_archivo_api(ontology: str, format: str = 'ttl'): +def fetch_from_dbpedia_archivo_api(ontology, headers): + format, version, versionMatching = get_parameters_from_headers(headers) dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' try: logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') @@ -104,4 +270,32 @@ def fetch_from_dbpedia_archivo_api(ontology: str, format: str = 'ttl'): return response except requests.exceptions.RequestException as e: logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') - return mock_response_404() \ No newline at end of file + return mock_response_404() + + +def map_mime_to_format(mime_type): + # Use the mimetypes library to get the file extension + extension = mimetypes.guess_extension(mime_type) + if not extension: + return None + + # Map file extensions to formats + ext_to_format = { + '.rdf': 'owl', + '.xml': 'owl', + '.ttl': 'ttl', + '.nt': 'nt', + # Add more mappings if needed + } + + return ext_to_format.get(extension, None) + + +def get_parameters_from_headers(headers): + # Map MIME types to formats + mime_type = headers.get('Accept', None) + format = map_mime_to_format(mime_type) + + version = headers.get('Version', None) + versionMatching = headers.get('VersionMatching', None) + return format, version, versionMatching \ No newline at end of file From 21a53b1c44415dbad71d2fb360d67b109ade1024 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 5 Jul 
2024 12:49:48 +0200 Subject: [PATCH 4/5] cleanup --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 09c2ac9..413afd3 100644 --- a/README.md +++ b/README.md @@ -35,5 +35,3 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem ### Not working: - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# - -# https://archivo.dbpedia.org/download?o=http://linked-web-apis.fit.cvut.cz/ns/core&v=2020.07.16-115638&versionMatching=timeStampClosest \ No newline at end of file From 3903e7eb338b035514fe8d4fa3c15730d7afcebe Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 12 Jul 2024 12:29:23 +0200 Subject: [PATCH 5/5] add unittests for unitlity functions --- tests/test_unit.py | 145 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tests/test_unit.py diff --git a/tests/test_unit.py b/tests/test_unit.py new file mode 100644 index 0000000..f0f76e8 --- /dev/null +++ b/tests/test_unit.py @@ -0,0 +1,145 @@ +import unittest +from unittest.mock import patch, Mock +import argparse +import requests +from ontologytimemachine.utils.mock_responses import ( + mock_response_200, + mock_response_403, + mock_response_404, + mock_response_500 +) +from ontologytimemachine.utils.utils import ( + parse_arguments, + fetch_from_dbpedia_archivo_api, + map_mime_to_format, + get_parameters_from_headers +) + + +class TestUtils(unittest.TestCase): + + @patch('argparse.ArgumentParser.parse_args') + def test_parse_arguments(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + ontoFormat='turtle', + ontoPrecedence='enforcedPriority', + patchAcceptUpstream=False, + ontoVersion='originalFailoverLive', + onlyOntologies=True, + httpsIntercept=False, + inspectRedirects=True, + forwardHeaders=True, + subjectBinarySearchThreshold=100 + ) + + args = parse_arguments() + + self.assertEqual(args[0]['format'], 'turtle') + self.assertEqual(args[0]['precedence'], 'enforcedPriority') + self.assertFalse(args[0]['patchAcceptUpstream']) + self.assertEqual(args[1], 'originalFailoverLive') + self.assertTrue(args[2]) + self.assertFalse(args[3]) + self.assertTrue(args[4]) + self.assertTrue(args[5]) + self.assertEqual(args[6], 100) + + mock_parse_args.return_value = argparse.Namespace( + ontoFormat='ntriples', + ontoPrecedence='default', + patchAcceptUpstream=True, + ontoVersion='latestArchive', + onlyOntologies=False, + httpsIntercept=True, + inspectRedirects=False, + forwardHeaders=False, + subjectBinarySearchThreshold=50 + ) + + args = parse_arguments() + + self.assertEqual(args[0]['format'], 'ntriples') + self.assertEqual(args[0]['precedence'], 'default') + self.assertTrue(args[0]['patchAcceptUpstream']) + self.assertEqual(args[1], 'latestArchive') + self.assertFalse(args[2]) + self.assertTrue(args[3]) + self.assertFalse(args[4]) + self.assertFalse(args[5]) + self.assertEqual(args[6], 50) + + + @patch('requests.get') + def test_fetch_from_dbpedia_archivo_api(self, mock_get): + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + ontology = 'http://dbpedia.org/ontology/Person' + headers = {'Accept': 'text/turtle'} + + response = fetch_from_dbpedia_archivo_api(ontology, headers) + self.assertEqual(response.status_code, 200) + + mock_get.side_effect = requests.exceptions.RequestException + response = fetch_from_dbpedia_archivo_api(ontology, headers) + self.assertEqual(response.status_code, 404) + + def 
test_map_mime_to_format(self): + self.assertEqual(map_mime_to_format('application/rdf+xml'), 'owl') + self.assertEqual(map_mime_to_format('text/turtle'), 'ttl') + self.assertEqual(map_mime_to_format('application/n-triples'), 'nt') + self.assertIsNone(map_mime_to_format('unknown/mime')) + + def test_get_parameters_from_headers(self): + headers = { + 'Accept': 'application/rdf+xml', + 'Version': '1.0', + 'VersionMatching': 'exact' + } + format, version, versionMatching = get_parameters_from_headers(headers) + self.assertEqual(format, 'owl') + self.assertEqual(version, '1.0') + self.assertEqual(versionMatching, 'exact') + + headers = { + 'Accept': 'unknown/mime', + 'Version': '2.0', + 'VersionMatching': 'compatible' + } + format, version, versionMatching = get_parameters_from_headers(headers) + self.assertIsNone(format) + self.assertEqual(version, '2.0') + self.assertEqual(versionMatching, 'compatible') + + + +class TestMockResponses(unittest.TestCase): + + def test_mock_response_200(self): + response = mock_response_200() + self.assertEqual(response.status_code, 200) + self.assertEqual(response.url, 'https://example.com/success') + self.assertEqual(response.headers['Content-Type'], 'text/html') + self.assertIn(b'
<html><body><h1>To be implemented</h1></body></html>
', response.content) + + def test_mock_response_403(self): + response = mock_response_403() + self.assertEqual(response.status_code, 403) + self.assertEqual(response.url, 'https://example.com/forbidden') + self.assertEqual(response.headers['Content-Type'], 'text/html') + self.assertIn(b'
<html><body><h1>403 Forbidden</h1></body></html>
', response.content) + + def test_mock_response_404(self): + response = mock_response_404() + self.assertEqual(response.status_code, 404) + self.assertEqual(response.url, 'https://example.com/resource-not-found') + self.assertEqual(response.headers['Content-Type'], 'text/html') + self.assertIn(b'
<html><body><h1>404 Not Found</h1></body></html>
', response.content) + + def test_mock_response_500(self): + response = mock_response_500() + self.assertEqual(response.status_code, 500) + self.assertEqual(response.url, 'https://example.com/internal-server-error') + self.assertEqual(response.headers['Content-Type'], 'text/html') + self.assertIn(b'
<html><body><h1>500 Internal Server Error</h1></body></html>
', response.content) \ No newline at end of file
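
---

For reviewers who want to poke at the failover behaviour these patches implement without running the full proxy, below is a minimal sketch of the same pattern: fetch the ontology from its origin first, and fall back to the DBpedia Archivo download API when the origin errors out or answers with a status code outside the passthrough list. ARCHIVO_API mirrors the dbpedia_api constant from utils.py, and the 'o'/'f' query parameters match those built in fetch_from_dbpedia_archivo_api; the function name and the abbreviated status-code set are illustrative, not part of the patch.

    import requests

    # From utils.py in these patches: the Archivo download endpoint and an
    # abbreviated version of passthrough_status_codes_http (the patch lists
    # additional codes).
    ARCHIVO_API = 'https://archivo.dbpedia.org/download'
    PASSTHROUGH_STATUS_CODES = {100, 101, 102, 103, 200, 451}

    def fetch_with_archivo_failover(ontology_iri, headers, timeout=5):
        """Try the origin server first; fall back to the Archivo download API."""
        try:
            response = requests.get(ontology_iri, headers=headers, timeout=timeout)
            if response.status_code in PASSTHROUGH_STATUS_CODES:
                return response
            # Unexpected status (e.g. 502): fall through to the Archivo fallback.
        except requests.exceptions.RequestException:
            pass  # network error, timeout or SSL failure: fall back to Archivo
        # 'o' is the ontology IRI, 'f' the requested serialization (Turtle here),
        # matching the query string used by fetch_from_dbpedia_archivo_api.
        return requests.get(ARCHIVO_API,
                            params={'o': ontology_iri, 'f': 'ttl'},
                            timeout=timeout)

    if __name__ == '__main__':
        resp = fetch_with_archivo_failover('http://ontologi.es/days#',
                                           {'Accept': 'text/turtle'})
        print(resp.status_code, resp.headers.get('Content-Type'))

Inside the plugin itself the equivalent behaviour is selected with --ontoVersion originalFailoverLive, the default declared in parse_arguments().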