Merge remote-tracking branch 'origin/master'
ConnectedSystems committed Apr 19, 2019
2 parents b767779 + 8d2341d commit a992186
Showing 10 changed files with 208 additions and 57 deletions.
11 changes: 2 additions & 9 deletions wosis/TopicResult.py
@@ -73,9 +73,6 @@ def find_paper_by_id(self, wos_id):
==========
* DataFrame of the matching topic, or None if not found.
"""
# tmp_df = self.corpora_df
# tmp_df.loc[tmp_df.id == wos_id]

for i in range(self.num_topics):
topic_id = i + 1
tmp_topic = self.get_topic_by_id(topic_id)
@@ -85,6 +82,7 @@ def find_paper_by_id(self, wos_id):
return tmp_topic
# End if
# End for
# End find_paper_by_id

def find_paper_by_doi(self, doi):
"""Search for a given record based on its DOI
@@ -97,9 +95,6 @@ def find_paper_by_doi(self, doi):
==========
* DataFrame of the matching topic, or None if not found.
"""
# tmp_df = self.corpora_df
# tmp_df.loc[tmp_df.DOI == doi]

for i in range(self.num_topics):
topic_id = i + 1
tmp_topic = self.get_topic_by_id(topic_id)
@@ -109,6 +104,4 @@ def find_paper_by_doi(self, doi):
return tmp_topic
# End if
# End for


# End find_paper_by_id()
# End find_paper_by_doi()
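For context, a minimal usage sketch of the two lookup methods touched above. The corpora variable and the example WoS ID and DOI values are hypothetical, and it assumes `find_topics` returns the `TopicResults` object directly:

```python
from wosis.analysis import find_topics

# `corpora` is assumed to be a metaknowledge RecordCollection loaded elsewhere.
topics = find_topics(corpora, model_type='NMF', num_topics=10, verbose=False)

# Both methods return the DataFrame of the topic containing the record,
# or None if no topic contains it.
by_id = topics.find_paper_by_id("WOS:000000000000000")    # hypothetical WoS ID
by_doi = topics.find_paper_by_doi("10.1000/example-doi")  # hypothetical DOI
```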
3 changes: 2 additions & 1 deletion wosis/analysis/__init__.py
@@ -1,3 +1,4 @@
from .search import *
from .pub_details import *
from .constrain import *
from .constrain import *
from .stats import *
65 changes: 52 additions & 13 deletions wosis/analysis/constrain.py
@@ -1,4 +1,6 @@
import os
import pandas as pd
import pickle

import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
@@ -19,7 +21,7 @@
from zipfile import BadZipfile

__all__ = ['find_topics', 'find_phrases', 'remove_by_journals',
'remove_by_title', 'remove_empty_DOIs', 'remove_by_keywords']
'remove_by_title', 'remove_empty_DOIs', 'remove_duplicates', 'remove_by_keywords']

# We lemmatize and stem words to homogenize similar content as much as possible
lemmer = WordNetLemmatizer().lemmatize
@@ -31,14 +33,23 @@ def _homogenize(word):
# End _homogenize()


def _ensure_df(corpora):
def _ensure_df(corpora, **kwargs):
"""Convert RecordCollection to DataFrame for analysis purposes with additional checks.
Parameters
==========
* corpora : object, representing corpora
"""
if 'metaknowledge' in str(type(corpora)).lower():
cols = ["AU", "SO", "DE", "DOI"]
try:
corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"],
stemmer=_homogenize))
corpora_df = rc_to_df(corpora, extra_cols=cols, stemmer=_homogenize, **kwargs)
# corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"],
# stemmer=_homogenize, **kwargs))
except BadZipfile:
warnings.warn("Could not stem/lemmatize content - set up NLTK WordNet data first!")
corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"]))
corpora_df = rc_to_df(corpora, extra_cols=cols, **kwargs)
# corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"]))
elif 'dataframe' in str(type(corpora)).lower():
corpora_df = corpora

@@ -50,7 +61,7 @@ def _remove_match(corpora, strings, col_name, verbose=True):
corpora_df = _ensure_df(corpora)

for unrelated in strings:
matched = corpora_df.loc[corpora_df[col_name].str.contains(unrelated), :]
matched = corpora_df.loc[corpora_df[col_name].str.contains(unrelated, case=False), :]
count_removed = matched['id'].count()

if verbose:
Expand All @@ -62,13 +73,13 @@ def _remove_match(corpora, strings, col_name, verbose=True):
# End _remove_match()


def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):
def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, cache_as=None, verbose=True):
"""Using one of several approaches, try to identify topics to help constrain search space.
Parameters
==========
* corpora_df : Pandas DataFrame, Corpora derived from Metaknowledge RecordCollection
* model_type : str, name of topic modeling approach
* corpora_df : DataFrame or RecordCollection, representing the corpora to apply topic modeling to
* model_type : str, name of topic modeling approach. Defaults to Non-negative Matrix Factorization
* num_topics : int, attempt to sort documents into this number of topics
* num_features : int, essentially the maximum number of words to consider per corpus
* verbose : bool, print out information or not. Default to True
@@ -77,6 +88,15 @@ def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):
==========
* tuple, TopicResults object
"""
# Load previously cached results if available
if cache_as and os.path.isfile(cache_as):
# Unpickle the cached TopicResults object
with open(cache_as, 'rb') as fn:
res = pickle.load(fn)
if verbose:
res.display_topics()
return res

corpora_df = _ensure_df(corpora)

combined_kws = corpora_df['DE'].str.split("|").tolist()
@@ -95,6 +115,12 @@ def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):

if verbose:
res.display_topics()

if cache_as:
# Cache topic model results
with open(cache_as, 'wb') as fn:
pickle.dump(res, fn)


return res

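A minimal sketch of the caching behaviour added in this hunk: the first call fits the model and pickles the result, later calls with the same `cache_as` path load it from disk. The corpora variable and cache file name are assumptions:

```python
from wosis.analysis import find_topics

# First run: fits the topic model and writes the pickle to disk.
res = find_topics(corpora, model_type='NMF', num_topics=10,
                  cache_as='nmf_topics.pkl', verbose=False)

# Later runs: the cached TopicResults object is unpickled instead of re-fitting.
res_again = find_topics(corpora, model_type='NMF', num_topics=10,
                        cache_as='nmf_topics.pkl', verbose=False)
```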
@@ -197,7 +223,7 @@ def remove_by_keywords(corpora, unrelated_keywords, verbose=True):
# End remove_by_keywords()


def remove_empty_DOIs(corpora, return_removed=False, verbose=True):
def remove_empty_DOIs(corpora, verbose=True):
"""Remove records with no associated DOI from DataFrame.
Parameters
@@ -222,6 +248,17 @@ def remove_empty_DOIs(corpora, return_removed=False, verbose=True):
# End remove_empty_DOIs()


def remove_duplicates(corpora, verbose=True):
"""Remove records with duplicate DOIs, keeping the first occurrence.
Returns a tuple of (filtered DataFrame, DataFrame of removed records).
"""
corpora_df = _ensure_df(corpora)
to_be_removed = corpora_df[corpora_df.DOI.duplicated()]
if verbose:
count_dups = to_be_removed['DOI'].count()
print("Removing {} duplicated records (identical DOIs)".format(count_dups))

filtered = corpora_df[~corpora_df.DOI.duplicated()]

return filtered, to_be_removed

def find_rake_phrases(corpora, min_len=2, max_len=None, lang='english'):
"""Find interesting phrases in given corpora.
"""
@@ -266,7 +303,7 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
Parameters
==========
* corpora : Pandas DataFrame
* corpora : DataFrame or RecordCollection
* top_n : int, number of phrases to display
* verbose : bool, if True prints text, document title and top `n` phrases. Defaults to False.
* weighted_keywords : list or None, if defined give further weight on phrases with given keywords.
Expand All @@ -276,6 +313,7 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
* dict, results with DOI as main key, human-readable document title and DOI as sub-keys, and identified phrases as elements
"""
if 'metaknowledge' in str(type(corpora)).lower():
# Does not use _ensure_df as complete sentences are desired
corpora = rc_to_df(corpora, removeNumbers=False)

ccc = corpora['abstract'].tolist()
Expand All @@ -293,8 +331,9 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
print(doc_title)

if len(corpus) == 0:
print(" No abstract for {}, skipping...\n".format(doc_title))
return None
if verbose:
print(" No abstract for {}, skipping...\n".format(doc_title))
continue

for idx, sent in enumerate(sent_tokenize_list):
split_sent = sent.split(" ")
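A short sketch of the new `remove_duplicates` helper added in this file. It keeps the first record for each DOI and also returns the rows that were dropped; the corpora variable is assumed:

```python
from wosis.analysis import remove_duplicates

# `corpora` may be a metaknowledge RecordCollection or a DataFrame.
filtered, dropped = remove_duplicates(corpora, verbose=True)
print("Kept {} records, dropped {} duplicates".format(len(filtered), len(dropped)))
```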
24 changes: 19 additions & 5 deletions wosis/analysis/plotting.py
@@ -27,7 +27,7 @@ def wrapper(*args, **kwargs):

if save_plot_fn:
if save_plot_fn.endswith('.png'):
save_plot_fn = save_plot_fn.strip('.png')
save_plot_fn = save_plot_fn.replace('.png', '')

fig.savefig(save_plot_fn + '.png', format='png',
dpi=300, bbox_inches='tight')
@@ -159,7 +159,7 @@ def plot_kw_trend(search_results, title=None, no_log_scale=False):
See Also
==========
* wosis.analysis.search.search_records()
* wosis.analysis.search.find_keywords()
Returns
==========
@@ -238,15 +238,25 @@ def plot_kw_trend(search_results, title=None, no_log_scale=False):


@plot_saver
def plot_pub_per_kw(ind_recs, summary, corpora, kw_category, annotate=False):
def plot_pub_per_kw(kw_matches, corpora, kw_category, annotate=False):
"""Plot publications per keyword.
Parameters
==========
* ind_recs : dict, of keywords and matching publication records
* summary : dict, of keywords and matching number of publications
* kw_matches : KeywordMatch object
* corpora : Metaknowledge Collection, representing corpora
* kw_category : str, text indicating keyword category for use in plot title
* annotate : bool, display number of records in plot
Example
==========
```python
# where RC is some RecordCollection
keywords = set(["software practice", "software development", "software engineering",
"best practice", "modeling practice"])
matches = wosis.keyword_matches(RC, keywords, 95.0)
wos_plot.plot_pub_per_kw(matches, RC, 'Practices')
```
See Also
==========
@@ -257,6 +267,7 @@ def plot_pub_per_kw(ind_recs, summary, corpora, kw_category, annotate=False):
==========
* matplotlib figure object
"""
ind_recs, summary = kw_matches.recs, kw_matches.summary
unique_titles = get_unique_kw_titles(ind_recs)
num_titles = len(unique_titles)
top_title = "Num. Publications per {} Keyword".format(kw_category.title())
Expand Down Expand Up @@ -374,6 +385,9 @@ def plot_journal_pub_trend(search_results, title='Journal Publication Trend', to
pubs_across_time = pubs_across_time.fillna(0.0).transpose()
pubs_across_time = pubs_across_time.sort_index()

# Reorder based on total publications
pubs_across_time = pubs_across_time[top_n_journals.index]

axes = pubs_across_time.plot(subplots=True, figsize=(
12, 10), layout=(top_n, 1), sharey=True, legend=False)

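For context, a hedged sketch of calling one of the `plot_saver`-wrapped functions above with a file name: the wrapper strips a trailing `.png` and saves the figure at 300 dpi. Passing `save_plot_fn` as a keyword argument, and the `search_results` variable, are assumptions:

```python
import wosis.analysis.plotting as wos_plot

# `search_results` would come from wosis.analysis.search.find_keywords().
wos_plot.plot_kw_trend(search_results, title='Keyword trend',
                       save_plot_fn='keyword_trend.png')  # written as keyword_trend.png
```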
6 changes: 3 additions & 3 deletions wosis/analysis/pub_details.py
@@ -74,11 +74,11 @@ def link_to_pub(records):
==========
* Pandas DataFrame with additional column ('DOI link')
"""
if 'metaknowledge' in str(type(records)):
if 'metaknowledge' in str(type(records)).lower():
recs = records.forNLP(extraColumns=["AU", "SO", "DE", 'DOI'], lower=False, removeNonWords=False)
df = pd.DataFrame(recs)
elif 'DataFrame' in str(type(records)):
df = records
elif 'dataframe' in str(type(records)).lower():
df = records.copy()
# End if

df.loc[df['DOI'] != '', 'DOI link'] = "https://dx.doi.org/" + df.loc[df['DOI'] != '', 'DOI'].astype(str)
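A brief usage sketch of `link_to_pub` as shown above, which adds a 'DOI link' column for records that have a DOI. The `records` variable and the 'title' column name are assumptions:

```python
from wosis.analysis.pub_details import link_to_pub

# `records` may be a metaknowledge RecordCollection or a DataFrame.
linked = link_to_pub(records)
print(linked.loc[linked['DOI'] != '', ['title', 'DOI link']].head())
```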
56 changes: 52 additions & 4 deletions wosis/analysis/search.py
@@ -6,8 +6,19 @@

from wosis.KeywordMatch import KeywordMatch

import warnings

def search_records(records, keywords, threshold=60.0):
"""Deprecated function: Search records for a given set of keywords.
Use `find_keywords()` instead.
"""
warnings.warn("Deprecated function! Use `find_keywords()` instead")
return find_keywords(records, keywords, threshold=threshold)
# End search_records()


def find_keywords(records, keywords, threshold=60.0):
"""Search records for a given set of keywords.
Keywords will be transformed to lower case.
Expand Down Expand Up @@ -73,7 +84,7 @@ def search_records(records, keywords, threshold=60.0):
matches.name = '{}'.format(keywords)

return matches
# End search_records()
# End find_keywords()


def keyword_matches(records, keywords, threshold=60.0):
@@ -89,9 +100,12 @@ def keyword_matches(records, keywords, threshold=60.0):
==========
* dict, matching records by keyword
"""
if isinstance(keywords, str):
keywords = [keywords, ]

matching_records = {}
for kw in keywords:
matching_records[kw] = search_records(records, set([kw, ]), threshold)
matching_records[kw] = find_keywords(records, set([kw, ]), threshold)
# End for

return KeywordMatch(matching_records)
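A small sketch of the change in this hunk: `keyword_matches` now also accepts a single keyword string, which is wrapped in a list internally. The RecordCollection `RC` is assumed:

```python
import wosis

# Equivalent ways of matching a single keyword against the corpora `RC`.
single = wosis.keyword_matches(RC, "uncertainty analysis", threshold=95.0)
several = wosis.keyword_matches(RC, ["uncertainty analysis", "sensitivity analysis"],
                                threshold=95.0)
```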
@@ -118,7 +132,7 @@ def keyword_matches_by_criteria(records, keyword_criteria, threshold=60.0):
criteria_matches = {}
for criteria in list(keyword_criteria):
criteria_kws = keyword_criteria[criteria]
search_results = search_records(
search_results = find_keywords(
records, criteria_kws, threshold=threshold)
kw_match = keyword_matches(search_results, criteria_kws, threshold)

@@ -240,9 +254,43 @@ def find_pubs_by_title(records, titles):
# titles is a string, convert to list
titles = [titles]

titles = set(titles)

new_rc = mk.RecordCollection()
for rec in records:
curr_doi = rec.get('DI')
if rec.title in titles and not new_rc.containsID(curr_doi):
new_rc.add(rec)

if len(new_rc) == 0:
return None

return new_rc
# End find_pubs_by_title()


def find_pubs_by_doi(records, dois):
"""Find publications by title.
Parameters
==========
* records : Metaknowledge RecordCollection
* dois : list, of DOIs to search for (has to be an exact match)
Returns
==========
* Metaknowledge RecordCollection or None if no matches found
"""
if hasattr(dois, 'lower'):
# dois is a string, convert to list
dois = [dois]

dois = set(dois)

new_rc = mk.RecordCollection()
for rec in records:
if rec.title in titles:
curr_doi = rec.get('DI')
if curr_doi in dois and not new_rc.containsID(curr_doi):
new_rc.add(rec)

if len(new_rc) == 0:
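To round out this file, a hedged sketch of the renamed search entry point and the new DOI lookup. The RecordCollection `RC`, the keyword set, and the example DOI are hypothetical:

```python
from wosis.analysis.search import find_keywords, find_pubs_by_doi

# `search_records` still works but warns and forwards to `find_keywords`.
matches = find_keywords(RC, {"model calibration", "uncertainty"}, threshold=80.0)

# Returns a RecordCollection of records whose DOI is in the given list, or None.
pubs = find_pubs_by_doi(RC, ["10.1000/example-doi"])
```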