Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions JibinCB/authors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pypdf
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_names_from_pdf(pdf_path):
    """Return the person names spaCy detects on the first page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with the text of every entity the module-level ``nlp``
        pipeline labels ``PERSON``, in the order the pipeline reports them.
        The list is empty when the PDF has no pages or the first page
        yields no text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pages = pypdf.PdfReader(pdf_file).pages
        # Guard against a page-less PDF instead of failing with IndexError.
        # Extraction stays inside the ``with`` block so the file is open
        # while pypdf reads the page.
        text = pages[0].extract_text() if len(pages) else ""

    if not text:
        return []
    return [ent.text for ent in nlp(text).ents if ent.label_ == "PERSON"]

# Example usage
def authors(pdf_path="paper1.pdf"):
    """Return the person names found on the first page of a PDF.

    Args:
        pdf_path: Path of the PDF to inspect. Defaults to ``"paper1.pdf"``,
            the path this function used to have hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        The list produced by ``extract_names_from_pdf`` for ``pdf_path``.
    """
    return extract_names_from_pdf(pdf_path)


Binary file added JibinCB/letter.pdf
Binary file not shown.
17 changes: 17 additions & 0 deletions JibinCB/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
from summary import summary_of_pdf
from authors import authors
from title_of_pdf import title
# Collect the extracted metadata for the paper as a single table row.
# ``authors()`` returns a list of names; join them so the spreadsheet cell
# holds readable text rather than a Python list object.
new_data = [
    {
        'Title': title(),
        'Authors': ', '.join(authors()),
        'Summary': summary_of_pdf(),
    }
]
df = pd.DataFrame(new_data)

# Write the table to sample.xlsx. Nothing is read from or appended to an
# existing workbook: the file is produced from this one row only.
df.to_excel('sample.xlsx', index=False)
Binary file added JibinCB/paper1.pdf
Binary file not shown.
Binary file added JibinCB/sample.xlsx
Binary file not shown.
36 changes: 36 additions & 0 deletions JibinCB/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pypdf
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, separated by
        newlines. A page that yields no text contributes an empty string.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = pypdf.PdfReader(pdf_file)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next; ``or ""`` tolerates a page without text.
        # The join runs to completion before the ``with`` block closes the file.
        return "\n".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )


def summarize_text(text, sentences_count=20):
    """Summarize ``text`` with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences converted to strings and joined with single
        spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer(LANGUAGE)).document

    # Stemmer and stop words both use the module-level LANGUAGE setting.
    lsa = LsaSummarizer(Stemmer(LANGUAGE))
    lsa.stop_words = get_stop_words(LANGUAGE)

    return " ".join(map(str, lsa(document, sentences_count)))


# Example usage
def summary_of_pdf(pdf_path="paper1.pdf"):
    """Return a summary of the full text of a PDF.

    Args:
        pdf_path: Path of the PDF to summarize. Defaults to
            ``"paper1.pdf"``, the path this function used to have
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        The string produced by ``summarize_text`` (with its default
        sentence count) for the text ``extract_text_from_pdf`` reads from
        ``pdf_path``.
    """
    return summarize_text(extract_text_from_pdf(pdf_path))
19 changes: 19 additions & 0 deletions JibinCB/title_of_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pypdf
import re

# Open the PDF file
def title(pdf_path='paper1.pdf'):
    """Guess a PDF's title from the upper-case text on its first page.

    The title is taken to be the first run of capital letters, whitespace,
    dots and hyphens (bounded by word boundaries) in the first page's text.

    Args:
        pdf_path: Path of the PDF to inspect. Defaults to ``'paper1.pdf'``,
            the path this function used to have hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        The first matching run with surrounding whitespace stripped, or an
        empty string when the first page contains no such run.
    """
    # ``with`` closes the file even if pypdf raises while parsing the page.
    with open(pdf_path, 'rb') as pdf_file:
        first_page = pypdf.PdfReader(pdf_file).get_page(0)
        # NOTE(review): the positional 0 is kept from the original call; it
        # looks like it selects the text orientation to extract -- confirm
        # against the pypdf documentation.
        first_page_text = first_page.extract_text(0) or ''

    # The pattern has no groups, so the first ``search`` hit is the same
    # string ``findall(...)[0]`` would give, without raising on no match.
    match = re.search(r'\b[A-Z\s\.\-]+\b', first_page_text)
    return match.group(0).strip() if match else ''