scrape_mars.py
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import requests
import pymongo
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    # Launch a Chrome browser via Splinter, letting webdriver_manager
    # download and manage a matching chromedriver
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)


def scrape():
    browser = init_browser()
    mars_dict = {}

    # Mars News: URL of the page to be scraped
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)
    html = browser.html
    news_soup = BeautifulSoup(html, 'html.parser')

    # Latest headline and its teaser paragraph
    news_title = news_soup.find_all('div', class_='content_title')[0].text
    news_p = news_soup.find_all('div', class_='article_teaser_body')[0].text

    # Mars featured image to be scraped
    jpl_nasa_url = 'https://www.jpl.nasa.gov'
    images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(images_url)
    html = browser.html
    images_soup = BeautifulSoup(html, 'html.parser')

    # The src attribute is a relative path, so prepend the JPL base URL
    relative_image_path = images_soup.find_all('img')[3]["src"]
    featured_image_url = jpl_nasa_url + relative_image_path

    # Mars facts to be scraped and converted into an HTML table
    facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(facts_url)
    mars_facts_df = tables[2]
    mars_facts_df.columns = ["Description", "Value"]
    mars_html_table = mars_facts_df.to_html()
    # str.replace returns a new string, so the result must be reassigned
    mars_html_table = mars_html_table.replace('\n', '')

    # Mars hemisphere names and images to be scraped
    usgs_url = 'https://astrogeology.usgs.gov'
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)
    hemispheres_html = browser.html
    hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')

    # Mars hemispheres product listings
    all_mars_hemispheres = hemispheres_soup.find('div', class_='collapsible results')
    mars_hemispheres = all_mars_hemispheres.find_all('div', class_='item')

    hemisphere_image_urls = []

    # Sift through each hemisphere entry
    for i in mars_hemispheres:
        # Collect the title
        hemisphere = i.find('div', class_="description")
        title = hemisphere.h3.text

        # Follow the detail link and collect the full-size image URL
        hemisphere_link = hemisphere.a["href"]
        browser.visit(usgs_url + hemisphere_link)
        image_html = browser.html
        image_soup = BeautifulSoup(image_html, 'html.parser')
        image_link = image_soup.find('div', class_='downloads')
        image_url = image_link.find('li').a['href']

        # Store the title/URL pair
        image_dict = {}
        image_dict['title'] = title
        image_dict['img_url'] = image_url
        hemisphere_image_urls.append(image_dict)

    # Assemble all scraped Mars data into a single dictionary
    mars_dict = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "fact_table": str(mars_html_table),
        "hemisphere_images": hemisphere_image_urls
    }

    # Close the browser before returning the results
    browser.quit()

    return mars_dict
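

# Minimal usage sketch: run this module directly to call scrape() and,
# assuming a MongoDB server is listening on the default local port, upsert
# the result with the pymongo import above. The "mars_app" database and
# "mars" collection names are illustrative assumptions.
if __name__ == "__main__":
    mars_data = scrape()
    print(mars_data["news_title"])

    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client.mars_app  # assumed database name
    db.mars.update_one({}, {"$set": mars_data}, upsert=True)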