|
| 1 | +import os |
| 2 | +from Bio import Entrez |
| 3 | +from loguru import logger |
| 4 | +from dotenv import load_dotenv |
| 5 | +from typing import Optional |
| 6 | + |
# Pull configuration (including ENTREZ_EMAIL) in from a local .env file
# before anything touches the Entrez API.
load_dotenv()

# NCBI requires a contact e-mail on every Entrez request; refuse to start
# without one rather than send anonymous queries.
if not (ENTREZ_EMAIL := os.getenv("ENTREZ_EMAIL")):
    raise EnvironmentError(
        "ENTREZ_EMAIL is not set in the environment. Please set it in your .env file."
    )

Entrez.email = ENTREZ_EMAIL

# Persist query activity to a size-rotated log file.
logger.add("pubmed_query.log", rotation="1 MB")
| 21 | + |
| 22 | + |
def query_pubmed_with_abstract(
    query: str,
    max_articles: int = 10,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    journal: Optional[str] = None,
    author: Optional[str] = None,
) -> str:
    """
    Query PubMed and return a formatted text report of the matching articles.

    Args:
        query (str): The search query.
        max_articles (int): Maximum number of articles to retrieve.
        start_date (Optional[str]): Start date for filtering (e.g., "2020/01/01").
            A one-sided range is honoured: if only one of start/end is given,
            the other end of the range is left open.
        end_date (Optional[str]): End date for filtering (e.g., "2023/12/31").
        journal (Optional[str]): Filter by journal name.
        author (Optional[str]): Filter by author name.

    Returns:
        str: One section per article (title, PMID, authors, source,
        publication date, abstract) separated by dashed lines. An empty
        string is returned when no articles match or an error occurs, so the
        return type is consistent on every path (the original returned a
        list on some paths and a string on others).
    """
    try:
        search_query = _build_search_query(query, journal, author, start_date, end_date)
        logger.info(f"Querying PubMed with search: {search_query}")

        # Step 1: search for matching PMIDs. Close the handle even if
        # Entrez.read raises (the original leaked it on that path).
        handle = Entrez.esearch(db="pubmed", term=search_query, retmax=max_articles)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        id_list = record["IdList"]
        logger.info(f"Found {len(id_list)} articles for query: {search_query}")
        if not id_list:
            logger.warning("No articles found.")
            return ""

        # Step 2: fetch full article records (XML) for all PMIDs at once.
        handle = Entrez.efetch(
            db="pubmed",
            id=",".join(id_list),
            rettype="xml",
            retmode="text",
        )
        try:
            articles = Entrez.read(handle)
        finally:
            handle.close()

        article_list = [_extract_article(a) for a in articles["PubmedArticle"]]
        logger.info(f"Successfully retrieved {len(article_list)} articles.")
        return _format_articles(article_list)
    except Exception as e:
        # Best-effort API: log the full traceback but never propagate,
        # matching the original's swallow-and-log behaviour.
        logger.exception(f"An error occurred during the PubMed query: {e}")
        return ""


def _build_search_query(
    query: str,
    journal: Optional[str],
    author: Optional[str],
    start_date: Optional[str],
    end_date: Optional[str],
) -> str:
    """Append the optional journal/author/date filters to the base query."""
    parts = [query]
    if journal:
        parts.append(f'"{journal}"[Journal]')
    if author:
        parts.append(f"{author}[Author]")
    # The original silently dropped the dates unless BOTH were supplied.
    # A one-sided range now leaves the missing end open.
    if start_date or end_date:
        lo = start_date or "1000/01/01"
        hi = end_date or "3000/12/31"
        parts.append(f"({lo}[Date - Publication] : {hi}[Date - Publication])")
    return " AND ".join(parts)


def _extract_article(article: dict) -> dict:
    """Flatten one PubmedArticle XML record into a plain info dict."""
    medline = article.get("MedlineCitation", {})
    meta = medline.get("Article", {})
    journal_meta = meta.get("Journal", {})

    authors = [
        # Corporate/collective authors lack LastName — label them explicitly.
        f"{a['LastName']} {a.get('Initials', '')}" if "LastName" in a else "Unknown Author"
        for a in meta.get("AuthorList", [])
    ]
    # AbstractText may be split into several labelled sections; join them.
    abstract_parts = meta.get("Abstract", {}).get("AbstractText", [])
    return {
        "Title": meta.get("ArticleTitle", "N/A"),
        "PMID": medline.get("PMID", "N/A"),
        "Authors": authors,
        "Source": journal_meta.get("Title", "N/A"),
        "PublicationDate": journal_meta.get("JournalIssue", {}).get("PubDate", "N/A"),
        "Abstract": " ".join(str(p) for p in abstract_parts) if abstract_parts else "N/A",
    }


def _format_articles(article_list: list) -> str:
    """Render extracted article dicts into one dashed-separator report string."""
    sections = []
    for art in article_list:
        sections.append(
            f"Title: {art['Title']}\n"
            f"PMID: {art['PMID']}\n"
            f"Authors: {art['Authors']}\n"
            f"Source: {art['Source']}\n"
            f"Publication Date: {art['PublicationDate']}\n"
            f"Abstract: {art['Abstract']}\n"
            + "-" * 40
            + "\n\n"
        )
    # Single join instead of the original quadratic += accumulation.
    return "".join(sections)
| 152 | + |
| 153 | + |
| 154 | +# # Example usage with more search features |
| 155 | +# articles = query_pubmed_with_abstract( |
| 156 | +# query="deep learning in medical imaging", |
| 157 | +# max_articles=20, |
| 158 | +# ) |
| 159 | + |
| 160 | + |
| 161 | +# print(articles) |
0 commit comments