Commit cda3ed9

Add files via upload (#961)
2 parents 98fc3db + d165334 commit cda3ed9

File tree

1 file changed: +97 -0 lines changed

@@ -0,0 +1,97 @@
# Code to download newspapers from Istanbul University.
import os

import requests
from bs4 import BeautifulSoup  # the "lxml" parser must also be installed


class DownloadIstanbul:

    def __init__(self):
        # Scrape the archive index page for each newspaper's link and display name.
        main_webpage = "http://nek.istanbul.edu.tr:4444/ekos/GAZETE/index.php#gazete"
        soup = BeautifulSoup(requests.get(url=main_webpage).text, "lxml")
        self.newspaper_links = [
            f"http://nek.istanbul.edu.tr:4444/ekos/GAZETE/{link['href']}"
            for link in soup.find_all("a", href=True, class_="popular-category h-40")
        ]
        self.newspaper_names = [
            span.text for span in soup.find_all("span", class_="caption mb-2 d-block")
        ]

    # This method downloads every issue of every newspaper.
    def download_all(self):
        for newspaper_link, newspaper_name in zip(self.newspaper_links, self.newspaper_names):
            self._download_issues(newspaper_link, newspaper_name)

    # You can use this to find the names of the newspapers; it is really
    # useful for the download_newspaper method. Just copy-paste the name of
    # the newspaper you want to download as the newspaper_name argument.
    def show_newspaper_list(self):
        print(self.newspaper_names)

    # This method downloads every issue of a single newspaper.
    def download_newspaper(self, newspaper_name: str):
        if newspaper_name in self.newspaper_names:
            index = self.newspaper_names.index(newspaper_name)
            self._download_issues(self.newspaper_links[index], newspaper_name)
        else:
            print("Newspaper not found")

    # Shared download loop: fetch a newspaper's archive page, collect its PDF
    # links, and save each issue into a folder named after the newspaper.
    def _download_issues(self, newspaper_link, newspaper_name):
        os.makedirs(newspaper_name, exist_ok=True)

        newspaper_soup = BeautifulSoup(requests.get(url=newspaper_link).text, "lxml")
        newspapers = [
            anchor_tag["href"].replace(" ", "%20")
            for anchor_tag in newspaper_soup.find_all("a", href=True)
            if "pdf" in anchor_tag["href"]
        ]

        for newspaper in newspapers:
            response = requests.get(url=newspaper)
            filename = newspaper.replace("%20", " ").split("/")[-1]

            if response.status_code == 200:
                with open(f"{newspaper_name}/{filename}", "wb") as f:
                    f.write(response.content)
                print(f"{filename} was downloaded")
                with open("download_results.txt", "a") as f:
                    f.write(f"{filename} was downloaded\n")
            else:
                print(f"{newspaper_name} had response status code {response.status_code}")
                with open("download_results.txt", "a") as f:
                    f.write(f"{newspaper_name} had response status code {response.status_code}\n")

if __name__ == "__main__":
    di = DownloadIstanbul()
    di.show_newspaper_list()
    #di.download_newspaper("")
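For reference, a minimal usage sketch of the intended workflow (the newspaper name below is hypothetical; copy a real one from the show_newspaper_list() output):

    di = DownloadIstanbul()
    di.show_newspaper_list()                      # prints the available newspaper names
    di.download_newspaper("Hypothetical Gazete")  # hypothetical name; use one printed above
    # di.download_all()                           # or fetch every newspaper's archive (large download)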
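The archive is served over plain HTTP on a nonstandard port, so individual requests may fail or stall. A possible hardening sketch, not part of this commit: reuse one requests.Session with automatic retries. The retry counts and timeout values below are assumptions, not taken from the source.

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retries))

    # Drop-in replacement for requests.get(...) inside the download loops:
    response = session.get(url=newspaper, timeout=30)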
