|
# Y Combinator Hacker News web scraper
import requests

# lxml is never referenced directly: it is the parser backend that
# BeautifulSoup(..., "lxml") loads.  Importing it here surfaces a clear
# ImportError at startup instead of a parser error deep inside __init__.
import lxml
from bs4 import BeautifulSoup
| 4 | + |
| 5 | + |
class YCombinator:
    """Scraper for the Hacker News front page (news.ycombinator.com).

    Construction fetches and parses the page once; the results are kept
    as parallel lists, indexed by article position:

    - ``headlines`` / ``links``: title text and href of each article.
    - ``points``: score of each article (0 for "YC ... Hiring" job rows,
      which carry no score on the page).
    - ``sublines``: the "N points by user | age | M comments" line.
    - ``comments``: comment count parsed out of each subline.
    """

    def __init__(self, url="https://news.ycombinator.com/news"):
        """Fetch *url* and populate the parallel article lists.

        Args:
            url: Page to scrape. Defaults to the HN front page.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if the request takes longer than 10 seconds.
        """
        # Time out instead of hanging forever, and fail loudly on HTTP
        # errors instead of silently parsing an error page.
        response = requests.get(url=url, timeout=10)
        response.raise_for_status()
        self.hacker_soup = BeautifulSoup(response.text, "lxml")
        self.articles = self.hacker_soup.find_all("span", class_="titleline")

        # "123 points" / "1 point" -> 123.  split() takes the leading
        # number; the previous strip("points") removed *characters* from
        # the ends, which only worked by accident.
        self.points = [
            int(span.text.split()[0])
            for span in self.hacker_soup.find_all("span", class_="score")
        ]
        self.headlines = [article.find("a", href=True).text for article in self.articles]
        self.links = [article.find("a", href=True)["href"] for article in self.articles]
        # Sublines contain \u00a0 (non-breaking spaces); normalise to plain spaces.
        self.sublines = [
            span.text.strip().replace("\u00a0", " ")
            for span in self.hacker_soup.find_all("span", class_="subline")
        ]

        # Job ("YC ... Hiring") rows have no score/subline span, so
        # self.points and self.sublines come out shorter than
        # self.headlines.  Re-align them by inserting 0 points and the job
        # row's subtext at each job row's position.  enumerate() is used
        # instead of list.index() so duplicate headlines cannot mis-align
        # the insertion point.
        extra_subtext = [
            td.text.strip("\n")
            for td in self.hacker_soup.find_all("td", class_="subtext")
            if "comment" not in td.text
        ]
        extra_index = 0
        for index, headline in enumerate(self.headlines):
            if "YC" in headline and "Hiring" in headline:
                self.points.insert(index, 0)
                self.sublines.insert(index, extra_subtext[extra_index])
                extra_index += 1

        # Comment count is the last "|"-separated field of the subline,
        # e.g. "... | 42 comments" or "... | 1 comment".  Anything else
        # ("discuss", job rows) counts as 0.
        self.comments = []
        for subline in self.sublines:
            last_field = subline.split("|")[-1].strip()
            if "comment" in last_field:
                self.comments.append(int(last_field.split()[0]))
            else:
                self.comments.append(0)

    def _show_article(self, index):
        """Print headline, link and subline of the article at *index*."""
        print(self.headlines[index])
        print(self.links[index])
        print(self.sublines[index])
        print()

    def show_all(self):
        """Print every scraped headline, numbered, with link and subline."""
        for position, (headline, link, subline) in enumerate(
            zip(self.headlines, self.links, self.sublines), start=1
        ):
            print(f"{position}. {headline}")
            print(link)
            print(subline)
            print()

    def most_points(self):
        """Print the article with the highest score (no-op if none scraped)."""
        if not self.points:
            return
        self._show_article(self.points.index(max(self.points)))

    def most_comments(self):
        """Print the article with the most comments (no-op if none scraped)."""
        if not self.comments:
            return
        self._show_article(self.comments.index(max(self.comments)))

    def show_first(self):
        """Print the top (first) headline (no-op if none scraped)."""
        if not self.headlines:
            return
        self._show_article(0)
| 76 | + |
| 77 | + |
| 78 | + |
| 79 | + |
if __name__ == "__main__":
    # Scrape once, then print the full listing followed by the highlights.
    scraper = YCombinator()
    for report in (
        scraper.show_all,
        scraper.show_first,
        scraper.most_points,
        scraper.most_comments,
    ):
        report()