|
| 1 | +import os |
| 2 | +from Bio import Entrez |
| 3 | +from loguru import logger |
| 4 | +from dotenv import load_dotenv |
| 5 | +from typing import Optional |
| 6 | + |
# Pull configuration (including ENTREZ_EMAIL) in from a local .env file
# before anything touches the Entrez API.
load_dotenv()

# NCBI requires a contact e-mail on every Entrez request; refuse to start
# without one rather than send anonymous queries.
if not (ENTREZ_EMAIL := os.getenv("ENTREZ_EMAIL")):
    raise EnvironmentError(
        "ENTREZ_EMAIL is not set in the environment. Please set it in your .env file."
    )

Entrez.email = ENTREZ_EMAIL

# Persist query activity to a size-rotated log file.
logger.add("pubmed_query.log", rotation="1 MB")
| 21 | + |
| 22 | + |
def query_pubmed_with_abstract(
    query: str,
    max_articles: int = 10,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    journal: Optional[str] = None,
    author: Optional[str] = None,
) -> str:
    """
    Query PubMed and return a formatted text report of the matching articles.

    Args:
        query (str): The search query.
        max_articles (int): Maximum number of articles to retrieve.
        start_date (Optional[str]): Start date for filtering (e.g., "2020/01/01").
            A one-sided range is honoured: if only one of start/end is given,
            the other end of the range is left open.
        end_date (Optional[str]): End date for filtering (e.g., "2023/12/31").
        journal (Optional[str]): Filter by journal name.
        author (Optional[str]): Filter by author name.

    Returns:
        str: One section per article (title, PMID, authors, source,
        publication date, abstract) separated by dashed lines. An empty
        string is returned when no articles match or an error occurs, so the
        return type is consistent on every path (the original returned a
        list on some paths and a string on others).
    """
    try:
        search_query = _build_search_query(query, journal, author, start_date, end_date)
        logger.info(f"Querying PubMed with search: {search_query}")

        # Step 1: search for matching PMIDs. Close the handle even if
        # Entrez.read raises (the original leaked it on that path).
        handle = Entrez.esearch(db="pubmed", term=search_query, retmax=max_articles)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        id_list = record["IdList"]
        logger.info(f"Found {len(id_list)} articles for query: {search_query}")
        if not id_list:
            logger.warning("No articles found.")
            return ""

        # Step 2: fetch full article records (XML) for all PMIDs at once.
        handle = Entrez.efetch(
            db="pubmed",
            id=",".join(id_list),
            rettype="xml",
            retmode="text",
        )
        try:
            articles = Entrez.read(handle)
        finally:
            handle.close()

        article_list = [_extract_article(a) for a in articles["PubmedArticle"]]
        logger.info(f"Successfully retrieved {len(article_list)} articles.")
        return _format_articles(article_list)
    except Exception as e:
        # Best-effort API: log the full traceback but never propagate,
        # matching the original's swallow-and-log behaviour.
        logger.exception(f"An error occurred during the PubMed query: {e}")
        return ""


def _build_search_query(
    query: str,
    journal: Optional[str],
    author: Optional[str],
    start_date: Optional[str],
    end_date: Optional[str],
) -> str:
    """Append the optional journal/author/date filters to the base query."""
    parts = [query]
    if journal:
        parts.append(f'"{journal}"[Journal]')
    if author:
        parts.append(f"{author}[Author]")
    # The original silently dropped the dates unless BOTH were supplied.
    # A one-sided range now leaves the missing end open.
    if start_date or end_date:
        lo = start_date or "1000/01/01"
        hi = end_date or "3000/12/31"
        parts.append(f"({lo}[Date - Publication] : {hi}[Date - Publication])")
    return " AND ".join(parts)


def _extract_article(article: dict) -> dict:
    """Flatten one PubmedArticle XML record into a plain info dict."""
    medline = article.get("MedlineCitation", {})
    meta = medline.get("Article", {})
    journal_meta = meta.get("Journal", {})

    authors = [
        # Corporate/collective authors lack LastName — label them explicitly.
        f"{a['LastName']} {a.get('Initials', '')}" if "LastName" in a else "Unknown Author"
        for a in meta.get("AuthorList", [])
    ]
    # AbstractText may be split into several labelled sections; join them.
    abstract_parts = meta.get("Abstract", {}).get("AbstractText", [])
    return {
        "Title": meta.get("ArticleTitle", "N/A"),
        "PMID": medline.get("PMID", "N/A"),
        "Authors": authors,
        "Source": journal_meta.get("Title", "N/A"),
        "PublicationDate": journal_meta.get("JournalIssue", {}).get("PubDate", "N/A"),
        "Abstract": " ".join(str(p) for p in abstract_parts) if abstract_parts else "N/A",
    }


def _format_articles(article_list: list) -> str:
    """Render extracted article dicts into one dashed-separator report string."""
    sections = []
    for art in article_list:
        sections.append(
            f"Title: {art['Title']}\n"
            f"PMID: {art['PMID']}\n"
            f"Authors: {art['Authors']}\n"
            f"Source: {art['Source']}\n"
            f"Publication Date: {art['PublicationDate']}\n"
            f"Abstract: {art['Abstract']}\n"
            + "-" * 40
            + "\n\n"
        )
    # Single join instead of the original quadratic += accumulation.
    return "".join(sections)
| 152 | + |
| 153 | + |
| 154 | +# # Example usage with more search features |
| 155 | +# articles = query_pubmed_with_abstract( |
| 156 | +# query="deep learning in medical imaging", |
| 157 | +# max_articles=20, |
| 158 | +# ) |
| 159 | + |
| 160 | + |
| 161 | +# print(articles) |
0 commit comments