Skip to content

Eliminating Duplications (interactive session history of the database cleanup)

James Kent edited this page Mar 22, 2024 · 2 revisions

# --- Find duplicate DOIs and normalize empty-string identifiers to NULL ---
from sqlalchemy import func

# Count base studies sharing the same DOI; duplicates have count > 1.
query = (
    db.session.query(BaseStudy.doi, func.count(BaseStudy.doi))
    .group_by(BaseStudy.doi)
    .having(func.count(BaseStudy.doi) > 1)
)
dup_dois = query.all()

# Empty-string DOIs/PMIDs defeat duplicate detection (every "" groups
# together); convert them to NULL so they are ignored by the GROUP BY.
empty_string_dois = BaseStudy.query.filter_by(doi="").all()
for bs in empty_string_dois:
    bs.doi = None
db.session.add_all(empty_string_dois)
db.session.commit()

empty_string_pmids = BaseStudy.query.filter_by(pmid="").all()
for bs in empty_string_pmids:
    # BUG FIX: the session history assigned to `bs.pmids` (a nonexistent
    # attribute, silently created on the instance); the column is `pmid`.
    bs.pmid = None
db.session.add_all(empty_string_pmids)
db.session.commit()

# Re-query the duplicates after normalization, keeping only the DOI column.
query = (
    db.session.query(BaseStudy.doi)
    .group_by(BaseStudy.doi)
    .having(func.count(BaseStudy.doi) > 1)
)
dup_dois = query.all()

# Inspect the records behind the first duplicate DOI.
# NOTE: filter_by takes keyword arguments only — filter_by(doi=...),
# never filter_by(dup_dois[0][0]) as one history attempt tried.
dup_records = BaseStudy.query.filter_by(doi=dup_dois[0][0]).all()

# Biopython is required for the Entrez lookups below
# (install once with: pip install biopython).
from Bio import Entrez

def doi_to_pmid(doi): Entrez.email = "[email protected]" # Put your email here handle = Entrez.esearch(db="pubmed", term=doi) record = Entrez.read(handle) handle.close() pmids = record['IdList'] return pmids from Bio import Entrez

def doi_to_pmid(doi): Entrez.email = "[email protected]" # Put your email here handle = Entrez.esearch(db="pubmed", term=doi) record = Entrez.read(handle) handle.close() pmids = record['IdList'] return pmids doi_to_pmid(dup_dois[0][0]) dup_bs = BaseStudy.query.filter_by(doi=dup_dois[0][0]).all() dup_bs dup_bs[0].pmid dup_bs[1].pmid def get_paper_doi(pmid): Entrez.email = "[email protected]" # Put your email here handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml") record = Entrez.read(handle) handle.close() if record['PubmedArticle']: article = record['PubmedArticle'][0]['PubmedData']['ArticleIdList'] for item in article: if item.attributes['IdType'] == 'doi': return item return "DOI not found for the given PMID." get_paper_doi(dup_bs[0].pmid) res = get_paper_doi(dup_bs[0].pmid) str(res) def get_paper_doi(pmid): Entrez.email = "[email protected]" # Put your email here handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml") record = Entrez.read(handle) handle.close() if record['PubmedArticle']: article = record['PubmedArticle'][0]['PubmedData']['ArticleIdList'] for item in article: if item.attributes['IdType'] == 'doi': return str(item) return "DOI not found for the given PMID." 
# --- Repair base studies whose DOI is duplicated ---
# For each duplicated DOI, resolve the authoritative PMID from PubMed.
# Any study whose stored PMID disagrees gets its DOI re-resolved from its
# own PMID (it was probably assigned the wrong DOI on ingestion).
to_commit = []
for (dup_doi,) in dup_dois:
    dup_bs = BaseStudy.query.filter_by(doi=dup_doi).all()
    # BUG FIX: doi_to_pmid returns a LIST; the first drafts compared
    # bs.pmid (a string) against the whole list, which is never equal.
    pmids = doi_to_pmid(dup_doi)
    pmid = pmids[0] if pmids else None
    for bs in dup_bs:
        if bs.pmid == pmid:
            continue  # this record agrees with PubMed; it keeps the DOI
        if bs.pmid:
            # BUG FIX: the first draft wrote
            #   doi = get_paper_doi(bs.pmid) if "found" not in doi else bs.doi
            # which reads `doi` before it is bound (NameError).  Bind
            # first, then fall back to the existing DOI on the sentinel.
            doi = get_paper_doi(bs.pmid)
            doi = doi if "found" not in doi else bs.doi
            if doi != bs.doi:
                bs.doi = doi
                to_commit.append(bs)
                print(f"CHANGING {bs.name}")
                # Keep the versioned copies in sync with the base study.
                for v in bs.versions:
                    v.doi = doi
                    to_commit.append(v)
db.session.add_all(to_commit)
db.session.commit()

# Empty-string PMIDs also hide duplicates; normalize them (and their
# versions) to NULL.
to_commit = []
for bs in BaseStudy.query.filter_by(pmid="").all():
    bs.pmid = None
    to_commit.append(bs)
    for v in bs.versions:
        v.pmid = None if v.pmid == "" else v.pmid
        to_commit.append(v)
db.session.add_all(to_commit)
db.session.commit()

# --- Repair base studies whose PMID is duplicated (mirror image) ---
query = (
    db.session.query(BaseStudy.pmid)
    .group_by(BaseStudy.pmid)
    .having(func.count(BaseStudy.pmid) > 1)
)
dup_pmids = query.all()

to_commit = []
for (dup_pmid,) in dup_pmids:
    dup_bs = BaseStudy.query.filter_by(pmid=dup_pmid).all()
    doi = get_paper_doi(dup_pmid)
    for bs in dup_bs:
        if bs.doi == doi:
            continue  # agrees with PubMed; keep
        if bs.doi:
            pmids = doi_to_pmid(bs.doi)
            if not pmids:
                continue  # guard: esearch found nothing for this DOI
            pmid = pmids[0]
            if pmid != bs.pmid:
                bs.pmid = pmid
                to_commit.append(bs)
                print(f"CHANGING {bs.name}")
                for v in bs.versions:
                    v.pmid = pmid
                    to_commit.append(v)
db.session.add_all(to_commit)
db.session.commit()

def preprocess_string(string): # Remove non-alphanumeric characters and whitespace return re.sub(r'[^a-zA-Z0-9]', '', string)

def compare_strings(string1, string2):
    """True when the strings match after stripping non-alphanumerics.

    Comparison is case-insensitive: the session history found two real
    titles that differed only in capitalisation, so both sides are
    lowercased (this matches the corrected definition used later).
    """
    return preprocess_string(string1).lower() == preprocess_string(string2).lower()


# --- Merge duplicate-PMID base studies whose names match ---
to_commit = []
to_delete = []
for (dup_pmid,) in dup_pmids:
    dup_bs = BaseStudy.query.filter_by(pmid=dup_pmid).all()
    doi = get_paper_doi(dup_pmid)
    main_bs = None
    name = None
    non_matching_dups = []
    for bs in dup_bs:
        if bs.doi == doi:
            # The record whose DOI agrees with PubMed is the survivor.
            main_bs = bs
            name = bs.name
        else:
            non_matching_dups.append(bs)
    if main_bs is None:
        continue  # no authoritative record; skip rather than crash
    for nm_bs in non_matching_dups:
        if compare_strings(nm_bs.name or "", name or ""):
            print(f"DUP: {name}")
            # Move the duplicate's versions onto the survivor.
            main_bs.versions.extend(nm_bs.versions)
            # Fill empty fields on the survivor from the duplicate.
            # BUG FIX: the history used
            #   getattr(main_bs, attr, getattr(nm_bs, attr, None))
            # but getattr's default only fires when the attribute is
            # MISSING, never when it is None — so nothing was copied.
            # `metadata_` (not `metadata`) is the column name; `metadata`
            # is reserved by SQLAlchemy's declarative base.
            for attr in ["name", "description", "metadata_", "publication",
                         "authors", "year", "level"]:
                merged = getattr(main_bs, attr, None) or getattr(nm_bs, attr, None)
                setattr(main_bs, attr, merged)
            to_delete.append(nm_bs)
    to_commit.append(main_bs)

for bs in to_delete:
    db.session.delete(bs)
db.session.add_all(to_commit)
db.session.commit()

# Verify: re-query remaining duplicated PMIDs.
query = (
    db.session.query(BaseStudy.pmid)
    .group_by(BaseStudy.pmid)
    .having(func.count(BaseStudy.pmid) > 1)
)
dup_pmids = query.all()

def preprocess_string(string): # Remove non-alphanumeric characters and whitespace return re.sub(r'[^a-zA-Z0-9]', '', string)

def compare_strings(string1, string2):
    """Case-insensitive equality after stripping non-alphanumerics."""
    return preprocess_string(string1).lower() == preprocess_string(string2).lower()


# --- Merge duplicate-DOI base studies whose names match ---
to_commit = []
to_delete = []
for (dup_doi,) in dup_dois:
    # BUG FIX: several history attempts filtered by pmid=dup[0] while
    # iterating DOI duplicates; filter on the DOI column.
    dup_bs = BaseStudy.query.filter_by(doi=dup_doi).all()
    pmids = doi_to_pmid(dup_doi)
    pmid = pmids[0] if pmids else None  # esearch returns a list
    main_bs = None
    name = None
    non_matching_dups = []
    for bs in dup_bs:
        if bs.pmid == pmid:
            # Record whose PMID agrees with PubMed survives the merge.
            main_bs = bs
            name = bs.name
        else:
            non_matching_dups.append(bs)
    if main_bs is None:
        print("no main bs found")
        continue
    for nm_bs in non_matching_dups:
        # Guard against NULL names before comparing.
        if compare_strings(nm_bs.name or "", name or ""):
            print(f"DUP: {name}")
            main_bs.versions.extend(nm_bs.versions)
            # BUG FIX: getattr's third argument is a default used only
            # when the attribute is MISSING — it never falls back to the
            # duplicate's value when the survivor's field is None.  Use
            # `or` so empty survivor fields are filled from the duplicate.
            for attr in ["name", "description", "metadata_", "publication",
                         "authors", "year", "level"]:
                setattr(
                    main_bs,
                    attr,
                    getattr(main_bs, attr, None) or getattr(nm_bs, attr, None),
                )
            to_delete.append(nm_bs)
    to_commit.append(main_bs)

for bs in to_delete:
    db.session.delete(bs)
db.session.add_all(to_commit)
db.session.commit()

# Final check: no duplicate DOIs should remain.
query = (
    db.session.query(BaseStudy.doi)
    .group_by(BaseStudy.doi)
    .having(func.count(BaseStudy.doi) > 1)
)
dup_dois = query.all()

Clone this wiki locally