From b87641bf0b7a9693838b59e3699368c2a547f129 Mon Sep 17 00:00:00 2001 From: Faris Hijazi Date: Wed, 8 Apr 2020 17:24:02 +0300 Subject: [PATCH] fix #280: now parses new google response format, _get_AF_initDataCallback(). The Google page contains info in a script variable `AF_initDataCallback`. See the JavaScript that parses it: https://gist.github.com/FarisHijazi/6c9ba3fb315d0ce9bfa62c10dfa8b2f8 This commit is an implementation of this code. fix-2020-format. I have added an iterator that returns rg_meta objects --- .../google_images_download.py | 108 +++++++++++++++++- 1 file changed, 104 insertions(+), 4 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index fd89a3a9..85e0e6e0 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -125,7 +125,10 @@ def user_input(): class googleimagesdownload: def __init__(self): - pass + # this iterator contains the parsed info contained in the `AF_initDataCallback` script (2020 format) + # it can then be called in _get_next_item() + # NOTE: never access this directly, always use self._get_AF_initDataCallback() + self._info_AF_initDataCallback = None # Downloading entire Web Document (Raw Page Content) def download_page(self,url): @@ -714,9 +717,18 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri def _get_next_item(self,s): start_line = s.find('rg_meta notranslate') if start_line == -1: # If no links are found then give an error! 
- end_quote = 0 - link = "no_links" - return link, end_quote + + try: + data_iter = self._get_AF_initDataCallback(s) + # get the next item + final_object = next(data_iter) + + return final_object, 0 + except StopIteration: + # if StopIteration is raised, break from loop + end_quote = 0 + link = "no_links" + return link, end_quote else: start_line = s.find('class="rg_meta notranslate">') start_object = s.find('{', start_line + 1) @@ -793,6 +805,94 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments): count-1) + " is all we got for this search filter!") return items,errorCount,abs_path + def _get_AF_initDataCallback(self, page): + """ + :param page: html string + :return: self._info_AF_initDataCallback, this is an iterator containing rg_meta objects + """ + + if self._info_AF_initDataCallback is not None: + return self._info_AF_initDataCallback + + import bs4, re, json + + def get_metas(page): + """ + the this works by parsing the info in the page scripts + See the js code in: https://gist.github.com/FarisHijazi/6c9ba3fb315d0ce9bfa62c10dfa8b2f8 + + :returns a list of objects, these contain the image info + + how it works: + there's a