Merge remote-tracking branch 'origin/master'
ConnectedSystems committed Apr 19, 2019
2 parents b767779 + 8d2341d commit a992186
Showing 10 changed files with 208 additions and 57 deletions.
11 changes: 2 additions & 9 deletions wosis/TopicResult.py
@@ -73,9 +73,6 @@ def find_paper_by_id(self, wos_id):
==========
* DataFrame of the matching topic, or None if not found.
"""
# tmp_df = self.corpora_df
# tmp_df.loc[tmp_df.id == wos_id]

for i in range(self.num_topics):
topic_id = i + 1
tmp_topic = self.get_topic_by_id(topic_id)
@@ -85,6 +82,7 @@ def find_paper_by_id(self, wos_id):
return tmp_topic
# End if
# End for
# End find_paper_by_id

def find_paper_by_doi(self, doi):
"""Search for a given record based on its DOI
@@ -97,9 +95,6 @@ def find_paper_by_doi(self, doi):
==========
* DataFrame of the matching topic, or None if not found.
"""
# tmp_df = self.corpora_df
# tmp_df.loc[tmp_df.DOI == doi]

for i in range(self.num_topics):
topic_id = i + 1
tmp_topic = self.get_topic_by_id(topic_id)
@@ -109,6 +104,4 @@ def find_paper_by_doi(self, doi):
return tmp_topic
# End if
# End for


# End find_paper_by_id()
# End find_paper_by_doi()
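For context, a minimal usage sketch of the two lookup methods touched above. The corpora variable and the example WoS ID and DOI values are hypothetical, and it assumes `find_topics` returns the `TopicResults` object directly:

```python
from wosis.analysis import find_topics

# `corpora` is assumed to be a metaknowledge RecordCollection loaded elsewhere.
topics = find_topics(corpora, model_type='NMF', num_topics=10, verbose=False)

# Both methods return the DataFrame of the topic containing the record,
# or None if no topic contains it.
by_id = topics.find_paper_by_id("WOS:000000000000000")    # hypothetical WoS ID
by_doi = topics.find_paper_by_doi("10.1000/example-doi")  # hypothetical DOI
```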
3 changes: 2 additions & 1 deletion wosis/analysis/__init__.py
@@ -1,3 +1,4 @@
from .search import *
from .pub_details import *
from .constrain import *
from .constrain import *
from .stats import *
65 changes: 52 additions & 13 deletions wosis/analysis/constrain.py
@@ -1,4 +1,6 @@
import os
import pandas as pd
import pickle

import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
@@ -19,7 +21,7 @@
from zipfile import BadZipfile

__all__ = ['find_topics', 'find_phrases', 'remove_by_journals',
'remove_by_title', 'remove_empty_DOIs', 'remove_by_keywords']
'remove_by_title', 'remove_empty_DOIs', 'remove_duplicates', 'remove_by_keywords']

# We lemmatize and stem words to homogenize similar content as much as possible
lemmer = WordNetLemmatizer().lemmatize
@@ -31,14 +33,23 @@ def _homogenize(word):
# End _homogenize()


def _ensure_df(corpora):
def _ensure_df(corpora, **kwargs):
"""Convert RecordCollection to DataFrame for analysis purposes with additional checks.
Parameters
==========
* corpora : object, representing corpora
"""
if 'metaknowledge' in str(type(corpora)).lower():
cols = ["AU", "SO", "DE", "DOI"]
try:
corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"],
stemmer=_homogenize))
corpora_df = rc_to_df(corpora, extra_cols=cols, stemmer=_homogenize, **kwargs)
# corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"],
# stemmer=_homogenize, **kwargs))
except BadZipfile:
warnings.warn("Could not stem/lemmatize content - set up NLTK WordNet data first!")
corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"]))
corpora_df = rc_to_df(corpora, extra_cols=cols, **kwargs)
# corpora_df = pd.DataFrame(corpora.forNLP(extraColumns=["AU", "SO", "DE"]))
elif 'dataframe' in str(type(corpora)).lower():
corpora_df = corpora

@@ -50,7 +61,7 @@ def _remove_match(corpora, strings, col_name, verbose=True):
corpora_df = _ensure_df(corpora)

for unrelated in strings:
matched = corpora_df.loc[corpora_df[col_name].str.contains(unrelated), :]
matched = corpora_df.loc[corpora_df[col_name].str.contains(unrelated, case=False), :]
count_removed = matched['id'].count()

if verbose:
Expand All @@ -62,13 +73,13 @@ def _remove_match(corpora, strings, col_name, verbose=True):
# End _remove_match()


def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):
def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, cache_as=None, verbose=True):
"""Using one of several approaches, try to identify topics to help constrain search space.
Parameters
==========
* corpora_df : Pandas DataFrame, Corpora derived from Metaknowledge RecordCollection
* model_type : str, name of topic modeling approach
* corpora_df : DataFrame or RecordCollection, representing the corpora to apply topic modeling to
* model_type : str, name of topic modeling approach. Defaults to Non-negative Matrix Factorization
* num_topics : int, attempt to sort documents into this number of topics
* num_features : int, essentially the maximum number of words to consider per corpus
* verbose : bool, print out information or not. Default to True
@@ -77,6 +88,15 @@ def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):
==========
* tuple, TopicResults object
"""
# Load previously cached results if available
if cache_as and os.path.isfile(cache_as):
# Unpickle the cached TopicResults object
with open(cache_as, 'rb') as fn:
res = pickle.load(fn)
if verbose:
res.display_topics()
return res

corpora_df = _ensure_df(corpora)

combined_kws = corpora_df['DE'].str.split("|").tolist()
@@ -95,6 +115,12 @@ def find_topics(corpora, model_type='NMF', num_topics=10, num_features=1000, verbose=True):

if verbose:
res.display_topics()

if cache_as:
# Cache topic model results
with open(cache_as, 'wb') as fn:
pickle.dump(res, fn)


return res

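A minimal sketch of the caching behaviour added in this hunk: the first call fits the model and pickles the result, later calls with the same `cache_as` path load it from disk. The corpora variable and cache file name are assumptions:

```python
from wosis.analysis import find_topics

# First run: fits the topic model and writes the pickle to disk.
res = find_topics(corpora, model_type='NMF', num_topics=10,
                  cache_as='nmf_topics.pkl', verbose=False)

# Later runs: the cached TopicResults object is unpickled instead of re-fitting.
res_again = find_topics(corpora, model_type='NMF', num_topics=10,
                        cache_as='nmf_topics.pkl', verbose=False)
```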
@@ -197,7 +223,7 @@ def remove_by_keywords(corpora, unrelated_keywords, verbose=True):
# End remove_by_keywords()


def remove_empty_DOIs(corpora, return_removed=False, verbose=True):
def remove_empty_DOIs(corpora, verbose=True):
"""Remove records with no associated DOI from DataFrame.
Parameters
@@ -222,6 +248,17 @@ def remove_empty_DOIs(corpora, return_removed=False, verbose=True):
# End remove_empty_DOIs()


def remove_duplicates(corpora, verbose=True):
"""Remove records with duplicate DOIs, keeping the first occurrence.
Returns a tuple of (filtered DataFrame, DataFrame of removed records).
"""
corpora_df = _ensure_df(corpora)
to_be_removed = corpora_df[corpora_df.DOI.duplicated()]
if verbose:
count_dups = to_be_removed['DOI'].count()
print("Removing {} duplicated records (identical DOIs)".format(count_dups))

filtered = corpora_df[~corpora_df.DOI.duplicated()]

return filtered, to_be_removed

def find_rake_phrases(corpora, min_len=2, max_len=None, lang='english'):
"""Find interesting phrases in given corpora.
"""
@@ -266,7 +303,7 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
Parameters
==========
* corpora : Pandas DataFrame
* corpora : DataFrame or RecordCollection
* top_n : int, number of phrases to display
* verbose : bool, if True prints text, document title and top `n` phrases. Defaults to False.
* weighted_keywords : list or None, if defined give further weight on phrases with given keywords.
Expand All @@ -276,6 +313,7 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
* dict, results with DOI as main key, human-readable document title and DOI as sub-keys, and identified phrases as elements
"""
if 'metaknowledge' in str(type(corpora)).lower():
# Does not use _ensure_df as complete sentences are desired
corpora = rc_to_df(corpora, removeNumbers=False)

ccc = corpora['abstract'].tolist()
Expand All @@ -293,8 +331,9 @@ def find_phrases(corpora, top_n=5, verbose=False, weighted_keywords=None):
print(doc_title)

if len(corpus) == 0:
print(" No abstract for {}, skipping...\n".format(doc_title))
return None
if verbose:
print(" No abstract for {}, skipping...\n".format(doc_title))
continue

for idx, sent in enumerate(sent_tokenize_list):
split_sent = sent.split(" ")
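A short sketch of the new `remove_duplicates` helper added in this file. It keeps the first record for each DOI and also returns the rows that were dropped; the corpora variable is assumed:

```python
from wosis.analysis import remove_duplicates

# `corpora` may be a metaknowledge RecordCollection or a DataFrame.
filtered, dropped = remove_duplicates(corpora, verbose=True)
print("Kept {} records, dropped {} duplicates".format(len(filtered), len(dropped)))
```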
24 changes: 19 additions & 5 deletions wosis/analysis/plotting.py
@@ -27,7 +27,7 @@ def wrapper(*args, **kwargs):

if save_plot_fn:
if save_plot_fn.endswith('.png'):
save_plot_fn = save_plot_fn.strip('.png')
save_plot_fn = save_plot_fn.replace('.png', '')

fig.savefig(save_plot_fn + '.png', format='png',
dpi=300, bbox_inches='tight')
@@ -159,7 +159,7 @@ def plot_kw_trend(search_results, title=None, no_log_scale=False):
See Also
==========
* wosis.analysis.search.search_records()
* wosis.analysis.search.find_keywords()
Returns
==========
@@ -238,15 +238,25 @@ def plot_kw_trend(search_results, title=None, no_log_scale=False):


@plot_saver
def plot_pub_per_kw(ind_recs, summary, corpora, kw_category, annotate=False):
def plot_pub_per_kw(kw_matches, corpora, kw_category, annotate=False):
"""Plot publications per keyword.
Parameters
==========
* ind_recs : dict, of keywords and matching publication records
* summary : dict, of keywords and matching number of publications
* kw_matches : KeywordMatch object
* corpora : Metaknowledge Collection, representing corpora
* kw_category : str, text indicating keyword category for use in plot title
* annotate : bool, display number of records in plot
Example
==========
```python
# where RC is some RecordCollection
keywords = set(["software practice", "software development", "software engineering",
"best practice", "modeling practice"])
matches = wosis.keyword_matches(RC, keywords, 95.0)
wos_plot.plot_pub_per_kw(matches, RC, 'Practices')
```
See Also
==========
@@ -257,6 +267,7 @@ def plot_pub_per_kw(ind_recs, summary, corpora, kw_category, annotate=False):
==========
* matplotlib figure object
"""
ind_recs, summary = kw_matches.recs, kw_matches.summary
unique_titles = get_unique_kw_titles(ind_recs)
num_titles = len(unique_titles)
top_title = "Num. Publications per {} Keyword".format(kw_category.title())
Expand Down Expand Up @@ -374,6 +385,9 @@ def plot_journal_pub_trend(search_results, title='Journal Publication Trend', to
pubs_across_time = pubs_across_time.fillna(0.0).transpose()
pubs_across_time = pubs_across_time.sort_index()

# Reorder based on total publications
pubs_across_time = pubs_across_time[top_n_journals.index]

axes = pubs_across_time.plot(subplots=True, figsize=(
12, 10), layout=(top_n, 1), sharey=True, legend=False)

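For context, a hedged sketch of calling one of the `plot_saver`-wrapped functions above with a file name: the wrapper strips a trailing `.png` and saves the figure at 300 dpi. Passing `save_plot_fn` as a keyword argument, and the `search_results` variable, are assumptions:

```python
import wosis.analysis.plotting as wos_plot

# `search_results` would come from wosis.analysis.search.find_keywords().
wos_plot.plot_kw_trend(search_results, title='Keyword trend',
                       save_plot_fn='keyword_trend.png')  # written as keyword_trend.png
```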
6 changes: 3 additions & 3 deletions wosis/analysis/pub_details.py
@@ -74,11 +74,11 @@ def link_to_pub(records):
==========
* Pandas DataFrame with additional column ('DOI link')
"""
if 'metaknowledge' in str(type(records)):
if 'metaknowledge' in str(type(records)).lower():
recs = records.forNLP(extraColumns=["AU", "SO", "DE", 'DOI'], lower=False, removeNonWords=False)
df = pd.DataFrame(recs)
elif 'DataFrame' in str(type(records)):
df = records
elif 'dataframe' in str(type(records)).lower():
df = records.copy()
# End if

df.loc[df['DOI'] != '', 'DOI link'] = "https://dx.doi.org/" + df.loc[df['DOI'] != '', 'DOI'].astype(str)
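A brief usage sketch of `link_to_pub` as shown above, which adds a 'DOI link' column for records that have a DOI. The `records` variable and the 'title' column name are assumptions:

```python
from wosis.analysis.pub_details import link_to_pub

# `records` may be a metaknowledge RecordCollection or a DataFrame.
linked = link_to_pub(records)
print(linked.loc[linked['DOI'] != '', ['title', 'DOI link']].head())
```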
56 changes: 52 additions & 4 deletions wosis/analysis/search.py
@@ -6,8 +6,19 @@

from wosis.KeywordMatch import KeywordMatch

import warnings

def search_records(records, keywords, threshold=60.0):
"""Deprecated function: Search records for a given set of keywords.
Use `find_keywords()` instead.
"""
warnings.warn("Deprecated function! Use `find_keywords()` instead")
return find_keywords(records, keywords, threshold=threshold)
# End search_records()


def find_keywords(records, keywords, threshold=60.0):
"""Search records for a given set of keywords.
Keywords will be transformed to lower case.
Expand Down Expand Up @@ -73,7 +84,7 @@ def search_records(records, keywords, threshold=60.0):
matches.name = '{}'.format(keywords)

return matches
# End search_records()
# End find_keywords()


def keyword_matches(records, keywords, threshold=60.0):
@@ -89,9 +100,12 @@ def keyword_matches(records, keywords, threshold=60.0):
==========
* dict, matching records by keyword
"""
if isinstance(keywords, str):
keywords = [keywords, ]

matching_records = {}
for kw in keywords:
matching_records[kw] = search_records(records, set([kw, ]), threshold)
matching_records[kw] = find_keywords(records, set([kw, ]), threshold)
# End for

return KeywordMatch(matching_records)
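A small sketch of the change in this hunk: `keyword_matches` now also accepts a single keyword string, which is wrapped in a list internally. The RecordCollection `RC` is assumed:

```python
import wosis

# Equivalent ways of matching a single keyword against the corpora `RC`.
single = wosis.keyword_matches(RC, "uncertainty analysis", threshold=95.0)
several = wosis.keyword_matches(RC, ["uncertainty analysis", "sensitivity analysis"],
                                threshold=95.0)
```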
@@ -118,7 +132,7 @@ def keyword_matches_by_criteria(records, keyword_criteria, threshold=60.0):
criteria_matches = {}
for criteria in list(keyword_criteria):
criteria_kws = keyword_criteria[criteria]
search_results = search_records(
search_results = find_keywords(
records, criteria_kws, threshold=threshold)
kw_match = keyword_matches(search_results, criteria_kws, threshold)

@@ -240,9 +254,43 @@ def find_pubs_by_title(records, titles):
# titles is a string, convert to list
titles = [titles]

titles = set(titles)

new_rc = mk.RecordCollection()
for rec in records:
curr_doi = rec.get('DI')
if rec.title in titles and not new_rc.containsID(curr_doi):
new_rc.add(rec)

if len(new_rc) == 0:
return None

return new_rc
# End find_pubs_by_title()


def find_pubs_by_doi(records, dois):
"""Find publications by title.
Parameters
==========
* records : Metaknowledge RecordCollection
* dois : list, of DOIs to search for (has to be an exact match)
Returns
==========
* Metaknowledge RecordCollection or None if no matches found
"""
if hasattr(dois, 'lower'):
# dois is a string, convert to list
dois = [dois]

dois = set(dois)

new_rc = mk.RecordCollection()
for rec in records:
if rec.title in titles:
curr_doi = rec.get('DI')
if curr_doi in dois and not new_rc.containsID(curr_doi):
new_rc.add(rec)

if len(new_rc) == 0:
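To round out this file, a hedged sketch of the renamed search entry point and the new DOI lookup. The RecordCollection `RC`, the keyword set, and the example DOI are hypothetical:

```python
from wosis.analysis.search import find_keywords, find_pubs_by_doi

# `search_records` still works but warns and forwards to `find_keywords`.
matches = find_keywords(RC, {"model calibration", "uncertainty"}, threshold=80.0)

# Returns a RecordCollection of records whose DOI is in the given list, or None.
pubs = find_pubs_by_doi(RC, ["10.1000/example-doi"])
```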