"""Generate an SQL script that removes duplicated URL entries from map annotations.

The script logs in to OMERO, loads every map annotation with the given
namespace, groups the URL values of each annotation by the ID captured by the
user-supplied regex, and - for every ID that has more than one URL - prints
``DELETE`` statements for all URLs except the preferred one.  Nothing is
modified on the server; the SQL is only written to standard output.
"""
# Reconstructed final state of scripts/annotate/fix_annotations.py after the
# patch series:
#   1/3 Add script to fix duplicated URLs annotations
#   2/3 Group urls by id not annotation id
#   3/3 Add antibody

import argparse
import re


parser = argparse.ArgumentParser(
    description="Generate SQL script to fix duplicated URL annotations")
parser.add_argument(
    "url",
    # Raw string: the example is a regex and contains backslashes.  The named
    # group must be called "ID" because get_urls() reads m.group("ID").
    help="Regex of the URL to check for (incl a named group to match the ID!)"
         r', e.g. ".+ncbi\.nlm\.nih\.gov\/gene\/(?P<ID>.+)"')
parser.add_argument(
    "namespace",
    help="The namespace of the annotations, e.g. openmicroscopy.org/mapr/gene")

# Maps an annotation namespace to the name of the key/value entry that holds
# the URL for that kind of annotation.
url_names = {
    "openmicroscopy.org/mapr/gene": "Gene Identifier URL",
    "openmicroscopy.org/mapr/compound": "Compound Name URL",
    "openmicroscopy.org/mapr/phenotype": "Phenotype Term Accession URL",
    "openmicroscopy.org/mapr/antibody": "Antibody Identifier URL",
}

# Ordered by preference: https:// without www is preferred over https:// with
# www.  URLs matching neither pattern (e.g. http://) are never kept.
pref_urls = [
    re.compile(r"^https:\/\/(?!www).+"),
    re.compile(r"^https:\/\/.+"),
]


def get_annotations(conn, namespace):
    """
    Get all map annotations with the specific namespace
    :param conn: Reference to the BlitzGateway
    :param namespace: The namespace
    :return: Generator for map annotations
    """
    metadata_service = conn.getMetadataService()
    yield from metadata_service.loadSpecifiedAnnotations(
        'omero.model.MapAnnotation', [namespace], None, None)


def get_urls(ann, pattern):
    """
    Get all URLs from the map annotation matching the given pattern
    :param ann: The map annotation
    :param pattern: The compiled regex pattern; must define a group named "ID"
    :return: Dictionary with lists of URLs (key: ID)
    :raises KeyError: If the annotation's namespace is not in url_names
    """
    url_name = url_names[ann._ns._val]
    urls = {}
    for nv in ann._mapValue:
        if nv.name != url_name:
            continue
        m = pattern.match(nv.value)
        if m:
            urls.setdefault(m.group("ID"), []).append(nv.value)
    return urls


def _select_url_to_keep(urls):
    """Return the URL matching the highest-ranked pattern in pref_urls, or None."""
    # Preferences form the OUTER loop so that a better-ranked URL wins even
    # when a lower-ranked one appears earlier in the list.
    for pref_url in pref_urls:
        for url in urls:
            if pref_url.match(url):
                return url
    return None


def _sql_literal(value):
    """Return value as a single-quoted SQL string literal with quotes doubled."""
    # NOTE(review): assumes standard-conforming strings (backslashes are not
    # escape characters) - confirm against the target database settings.
    return "'" + value.replace("'", "''") + "'"


def check_annotations(conn, args):
    """
    Print DELETE statements for the duplicated URLs of all annotations
    :param conn: Reference to the BlitzGateway
    :param args: Parsed arguments providing `url` (regex) and `namespace`
    """
    pattern = re.compile(args.url)
    for ann in get_annotations(conn, args.namespace):
        ann_id = ann._id._val
        for url_id, urls in get_urls(ann, pattern).items():
            if len(urls) < 2:
                continue  # nothing duplicated for this ID
            url_to_keep = _select_url_to_keep(urls)
            if url_to_keep is None:
                continue  # no acceptable URL to keep, so delete nothing
            print(f"-- ID: {url_id} - Annotation ID: {ann_id}")
            print(f"-- URLs: {urls} - keep: {url_to_keep}")
            for url in urls:
                # NOTE: entries whose value equals the kept URL exactly are
                # left alone; a DELETE by value would remove the kept row too.
                if url != url_to_keep:
                    print("DELETE FROM annotation_mapvalue mv"
                          f" WHERE mv.annotation_id = {ann_id}"
                          f" AND mv.value = {_sql_literal(url)};\n")


def main(argv=None):
    """
    Parse and validate the command line, log in and print the SQL script
    :param argv: Argument list; defaults to sys.argv[1:]
    """
    args = parser.parse_args(argv)

    # Fail early with a usage error instead of a KeyError / IndexError raised
    # after logging in and loading the annotations.
    if args.namespace not in url_names:
        parser.error(f"unsupported namespace {args.namespace!r}; "
                     f"expected one of: {', '.join(sorted(url_names))}")
    try:
        pattern = re.compile(args.url)
    except re.error as err:
        parser.error(f"invalid URL regex: {err}")
    if "ID" not in pattern.groupindex:
        parser.error('the URL regex must contain a named group "ID", '
                     'e.g. (?P<ID>.+)')

    # Imported here so that the helpers above stay importable (and --help
    # works) where OMERO.py is not installed.  BlitzGateway is imported
    # explicitly rather than relying on omero.cli to pull in omero.gateway.
    from omero.cli import cli_login
    from omero.gateway import BlitzGateway

    with cli_login() as c:
        conn = BlitzGateway(client_obj=c.get_client())
        check_annotations(conn, args)


if __name__ == "__main__":
    main()