-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetching data.py
43 lines (32 loc) · 913 Bytes
/
fetching data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import requests
from bs4 import BeautifulSoup
def fetchdata(url, path):
    """Download the page at *url* and save its HTML text to *path*.

    Raises requests.HTTPError for 4xx/5xx responses and
    requests.Timeout if the server does not answer in time.
    """
    # Timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of silently saving an error page.
    r.raise_for_status()
    # Explicit encoding so the bytes written match the decoded r.text.
    with open(path, "w", encoding="utf-8") as f:
        f.write(r.text)
# NOTE(review): the original line `pro = requests.get('proxy server link')`
# crashed with MissingSchema ('proxy server link' is not a URL) and its
# result was never used. To actually route traffic through a proxy, pass a
# mapping to requests.get, e.g.:
#   requests.get(url, proxies={"http": "http://host:port", "https": "http://host:port"})
url = 'link of the targeted site'  # TODO: replace with a real target URL
fetchdata(url, "sample.html")
# We store the fetched data in the sample.html file, then read it back
# for offline parsing with BeautifulSoup.
with open("sample.html", 'r') as f:
    html_doc = f.read()
soup = BeautifulSoup(html_doc, 'html.parser')

# Dump the whole parsed tree, nicely indented.
print(soup.prettify())

# Title text and its type (a bs4 NavigableString, not a plain str).
print("\nTITLE \t TYPE")
print(soup.title.string, "\t", type(soup.title.string))

# Collect href, text and id of every <a> tag in a SINGLE pass over the
# document (the original walked find_all('a') twice to build parallel lists).
l1 = []  # hrefs
l2 = []  # anchor texts
l = []   # anchor ids (None when the tag has no id attribute)
for link in soup.find_all('a'):
    l1.append(link.get('href'))
    l2.append(link.get_text())
    l.append(link.get('id'))

print("\nLINKS \t LINK ID\t Text\n")
# zip walks the three parallel lists in lockstep — no index arithmetic.
for href, link_id, text in zip(l1, l, l2):
    print(href, "\t", link_id, "\t", text)