fix(google): search #216

Merged 3 commits on Oct 25, 2024
8 changes: 6 additions & 2 deletions README.md
@@ -30,6 +30,7 @@ from jobspy import scrape_jobs
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
search_term="software engineer",
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
results_wanted=20,
hours_old=72, # (only LinkedIn/Indeed are hour-specific; others round up to days old)
@@ -65,6 +66,9 @@ Optional
| (default is all)
├── search_term (str)
|
├── google_search_term (str)
| search term for Google jobs. This is the only parameter used to filter Google jobs.
├── location (str)
@@ -171,9 +175,9 @@ Indeed specific

## Supported Countries for Job Searching

### **LinkedIn / Google**
### **LinkedIn**

LinkedIn & Google searches globally & uses only the `location` parameter.
LinkedIn searches globally & uses only the `location` parameter.

### **ZipRecruiter**

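For context, a minimal sketch of the new parameter in use (assuming `python-jobspy` is installed; values are illustrative):

```python
from jobspy import scrape_jobs

# google_search_term is passed to Google Jobs as-is, so phrase it the way
# you would type it into Google (location and recency included).
jobs = scrape_jobs(
    site_name=["google"],
    search_term="software engineer",
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    results_wanted=20,
)
print(jobs[["title", "company", "location", "date_posted"]].head())
```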
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.74"
version = "1.1.75"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
8 changes: 6 additions & 2 deletions src/jobspy/__init__.py
@@ -24,6 +24,7 @@
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
google_search_term: str | None = None,
location: str | None = None,
distance: int | None = 50,
is_remote: bool = False,
@@ -86,6 +87,7 @@ def get_site_type():
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
google_search_term=google_search_term,
location=location,
distance=distance,
is_remote=is_remote,
@@ -216,8 +218,8 @@ def convert_to_annual(job_data: dict):
"title",
"company",
"location",
"job_type",
"date_posted",
"job_type",
"salary_source",
"interval",
"min_amount",
@@ -248,6 +250,8 @@ def convert_to_annual(job_data: dict):
jobs_df = jobs_df[desired_order]

# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
return jobs_df.sort_values(
by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
else:
return pd.DataFrame()
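As an aside on the new return statement: `sort_values` keeps the pre-sort row labels, so `reset_index(drop=True)` is what gives callers a clean 0..n-1 index. A standalone pandas sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({
    "site": ["linkedin", "google", "google"],
    "date_posted": pd.to_datetime(["2024-10-20", "2024-10-24", "2024-10-22"]),
})

# Sort sites A-Z and newest-first within each site, then renumber rows;
# without reset_index(drop=True) the index here would read 1, 2, 0.
out = df.sort_values(
    by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
print(out)
```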
1 change: 1 addition & 0 deletions src/jobspy/scrapers/__init__.py
@@ -28,6 +28,7 @@ class SalarySource(Enum):
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None

location: str | None = None
country: Country | None = Country.USA
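A hypothetical direct construction of the model with the new field (the import path and `Site.GOOGLE` member are assumptions based on the surrounding diff):

```python
from jobspy.scrapers import ScraperInput, Site  # import path assumed

scraper_input = ScraperInput(
    site_type=[Site.GOOGLE],
    search_term="software engineer",
    # New optional field from this PR; defaults to None.
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
)
```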
131 changes: 83 additions & 48 deletions src/jobspy/scrapers/google/__init__.py
@@ -59,13 +59,14 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
)
forward_cursor = self._get_initial_cursor()
forward_cursor, job_list = self._get_initial_cursor_and_jobs()
if forward_cursor is None:
logger.error("initial cursor not found")
return JobResponse(jobs=[])
logger.warning(
"initial cursor not found, try changing your query or there was at most 10 results"
)
return JobResponse(jobs=job_list)

page = 1
job_list: list[JobPost] = []

while (
len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
@@ -74,7 +75,11 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
try:
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
except Exception as e:
logger.error(f"failed to get jobs on page: {page}, {e}")
break
if not jobs:
logger.info(f"found no jobs on page: {page}")
break
Expand All @@ -87,8 +92,8 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
]
)

def _get_initial_cursor(self):
"""Gets initial cursor to paginate through job listings"""
def _get_initial_cursor_and_jobs(self) -> Tuple[str, list[JobPost]]:
"""Gets initial cursor and jobs to paginate through job listings"""
query = f"{self.scraper_input.search_term} jobs"

def get_time_range(hours_old):
@@ -121,13 +126,22 @@ def get_time_range(hours_old):
if self.scraper_input.is_remote:
query += " remote"

if self.scraper_input.google_search_term:
query = self.scraper_input.google_search_term

params = {"q": query, "udm": "8"}
response = self.session.get(self.url, headers=headers_initial, params=params)

pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
match_fc = re.search(pattern_fc, response.text)
data_async_fc = match_fc.group(1) if match_fc else None
return data_async_fc
jobs_raw = self._find_job_info_initial_page(response.text)
jobs = []
for job_raw in jobs_raw:
job_post = self._parse_job(job_raw)
if job_post:
jobs.append(job_post)
return data_async_fc, jobs
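Taken together, the new flow is: the initial request now returns both the first page of jobs and the forward cursor, and pagination continues from there. A simplified sketch of that control flow (`fetch_initial` and `fetch_next` are hypothetical stand-ins for `_get_initial_cursor_and_jobs` and `_get_jobs_next_page`):

```python
def paginate(fetch_initial, fetch_next, results_wanted: int):
    # The first request yields jobs as well as the cursor, so small result
    # sets (roughly <= 10 jobs, where Google returns no cursor) still come back.
    cursor, jobs = fetch_initial()
    if cursor is None:
        return jobs
    while cursor and len(jobs) < results_wanted:
        try:
            page_jobs, cursor = fetch_next(cursor)
        except Exception:
            break  # fail soft: keep whatever has been collected so far
        if not page_jobs:
            break
        jobs += page_jobs
    return jobs[:results_wanted]
```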

def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
@@ -147,55 +161,55 @@ def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
match_fc = re.search(pattern_fc, job_data)
data_async_fc = match_fc.group(1) if match_fc else None
jobs_on_page = []

for array in parsed:

_, job_data = array
if not job_data.startswith("[[["):
continue
job_d = json.loads(job_data)

job_info = self._find_job_info(job_d)

job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
if job_url in self.seen_urls:
continue
self.seen_urls.add(job_url)

title = job_info[0]
company_name = job_info[1]
location = city = job_info[2]
state = country = date_posted = None
if location and "," in location:
city, state, *country = [*map(lambda x: x.strip(), location.split(","))]

days_ago_str = job_info[12]
if type(days_ago_str) == str:
match = re.search(r"\d+", days_ago_str)
days_ago = int(match.group()) if match else None
date_posted = (datetime.now() - timedelta(days=days_ago)).date()

description = job_info[19]

job_post = JobPost(
id=f"go-{job_info[28]}",
title=title,
company_name=company_name,
location=Location(
city=city, state=state, country=country[0] if country else None
),
job_url=job_url,
job_url_direct=job_url,
date_posted=date_posted,
is_remote="remote" in description.lower()
or "wfh" in description.lower(),
description=description,
emails=extract_emails_from_text(description),
job_type=extract_job_type(description),
)
jobs_on_page.append(job_post)
job_post = self._parse_job(job_info)
if job_post:
jobs_on_page.append(job_post)
return jobs_on_page, data_async_fc

def _parse_job(self, job_info: list):
job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
if job_url in self.seen_urls:
return
self.seen_urls.add(job_url)

title = job_info[0]
company_name = job_info[1]
location = city = job_info[2]
state = country = date_posted = None
if location and "," in location:
city, state, *country = [*map(lambda x: x.strip(), location.split(","))]

days_ago_str = job_info[12]
if type(days_ago_str) == str:
match = re.search(r"\d+", days_ago_str)
days_ago = int(match.group()) if match else None
date_posted = (datetime.now() - timedelta(days=days_ago)).date()

description = job_info[19]

job_post = JobPost(
id=f"go-{job_info[28]}",
title=title,
company_name=company_name,
location=Location(
city=city, state=state, country=country[0] if country else None
),
job_url=job_url,
date_posted=date_posted,
is_remote="remote" in description.lower() or "wfh" in description.lower(),
description=description,
emails=extract_emails_from_text(description),
job_type=extract_job_type(description),
)
return job_post

@staticmethod
def _find_job_info(jobs_data: list | dict) -> list | None:
"""Iterates through the JSON data to find the job listings"""
@@ -213,3 +227,24 @@ def _find_job_info(jobs_data: list | dict) -> list | None:
if result:
return result
return None

@staticmethod
def _find_job_info_initial_page(html_text: str):
pattern = (
f'520084652":('
+ r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
)
results = []
matches = re.finditer(pattern, html_text)

import json

for match in matches:
try:
parsed_data = json.loads(match.group(1))
results.append(parsed_data)

except json.JSONDecodeError as e:
logger.error(f"Failed to parse match: {str(e)}")
results.append({"raw_match": match.group(0), "error": str(e)})
return results
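A standalone demonstration of how that bounded-depth pattern behaves (the input string is illustrative, not a real Google response). A regex cannot match arbitrarily nested brackets, so the alternation is repeated once per supported nesting level, four levels here:

```python
import json
import re

# Same pattern as in _find_job_info_initial_page: a literal key marker,
# then a capture of a bracketed array nested at most four levels deep.
pattern = (
    '520084652":('
    + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
)

html_text = 'noise "520084652":[["Engineer", ["SF", "CA"], [1, [2, 3]]]] noise'
match = re.search(pattern, html_text)
if match:
    print(json.loads(match.group(1)))  # [['Engineer', ['SF', 'CA'], [1, [2, 3]]]]
```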