-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata parsing from Instagram (by using online app).py
190 lines (149 loc) · 6.71 KB
/
data parsing from Instagram (by using online app).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# This script downloads photos from a selected Instagram account by passing each post link through a third-party online downloader (savefrom.net).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
import requests
import os
import random
import datetime
# Desktop Chrome user-agent strings; one of them is chosen at random for the
# browser session below.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
]

# HTTP headers carrying a fixed browser User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}

# Chrome is started headless, with a random user agent and with the
# "AutomationControlled" blink feature switched off.
options = webdriver.ChromeOptions()
options.add_argument("user-agent=" + random.choice(user_agents))
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--headless")

# Path to the ChromeDriver binary.
# NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm it is
# still accepted by the installed Selenium version.
driver = webdriver.Chrome(
    executable_path=r"D:\chromedriver.exe",
    options=options,
)

# Profile to download from (keep the same form as this example).
link = "https://www.instagram.com/instagram"

# Instagram credentials used by login().
user_name = ""  # your login
user_pass = ""  # your password
def login():
    """Open the Instagram login page and submit the configured credentials.

    The module-level ``user_name`` and ``user_pass`` values are typed into the
    ``username`` and ``password`` form fields and the form is submitted by
    pressing ENTER in the password field. The result of the login attempt is
    not inspected: the success message is printed unconditionally.
    """
    driver.get(url="https://www.instagram.com/accounts/login")
    # Element lookups may wait up to 3 seconds for the form to appear.
    driver.implicitly_wait(3)

    login_field = driver.find_element(By.NAME, 'username')
    login_field.clear()
    login_field.send_keys(user_name)
    print("login - ok")

    password_field = driver.find_element(By.NAME, 'password')
    password_field.clear()
    password_field.send_keys(user_pass)
    print("password - ok\n")

    # Short pause before submitting, then a longer one after submitting.
    time.sleep(2)
    password_field.send_keys(Keys.ENTER)
    print("authorization was successful!\n")
    time.sleep(5)
def create_folder(workspace, folder):
    """Make sure the directory ``workspace/folder`` exists and report the outcome.

    Args:
        workspace: Parent directory (created as well if it is missing).
        folder: Name of the directory to create inside ``workspace``.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    path = os.path.join(workspace, folder)
    # isdir (not exists): a regular file at this path must not be reported as
    # an existing folder, because later writes into it would fail.
    if os.path.isdir(path):
        print("folder {0} exists ".format(path))
        return
    # exist_ok tolerates another process creating the directory between the
    # check above and this call; a non-directory at `path` still raises.
    os.makedirs(path, exist_ok=True)
    print("create folder with path {0}".format(path))
def _ask_post_count(total_posts):
    """Prompt until the user enters a whole number from 1 to *total_posts*."""
    while True:
        try:
            requested = int(input(f"enter the number of posts (from 1 to {total_posts}): "))
        except ValueError:
            # Only a malformed number is retried; Ctrl+C / EOF must still be
            # able to leave the prompt.
            print("please enter number")
            continue
        if 1 <= requested <= total_posts:
            print(f"the first {requested} posts will be downloaded now")
            return requested
        print(f"please enter the number of posts (from 1 to {total_posts}): ")


def _collect_post_links(wanted):
    """Scroll the currently open profile page and return up to *wanted* unique post links."""
    # One scroll per 12 requested posts plus one extra pass (the divisor 12 is
    # presumably the number of posts loaded per scroll - verify).
    scroll_passes = wanted // 12 + 1
    found = []
    for passes_left in range(scroll_passes, 0, -1):
        print(f"-----------pages left: {passes_left}-----------")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        anchors = driver.find_elements(By.XPATH, "//div[@class='_aabd _aa8k _aanf']//child::a")
        found.extend(anchor.get_attribute('href') for anchor in anchors)
    print(found)

    # The same post shows up on several passes: de-duplicate while keeping the
    # page order, and drop anchors that had no href.
    unique = [href for href in dict.fromkeys(found) if href]
    print(unique)

    if len(unique) > wanted:
        print("all links have been collected\n")
    selected = unique[:wanted]
    print(selected)
    return selected


def _download_photos(post_links, account):
    """Resolve each post link through savefrom.net, save its photos and return how many were saved."""
    target_dir = os.path.join(".", "instagram", account)
    saved = 0
    for position, post_link in enumerate(post_links):
        print(f"-----------------{len(post_links) - position} links left-----------------")
        driver.get(url="https://en.savefrom.net/163/download-from-instagram")
        driver.implicitly_wait(3)
        search_box = driver.find_element(By.NAME, 'sf_url')
        search_box.clear()
        search_box.send_keys(post_link)
        print("link added\n")
        search_box.send_keys(Keys.ENTER)
        # Lookups for the result links may wait up to 5 seconds.
        driver.implicitly_wait(5)
        download_anchors = driver.find_elements(By.XPATH, "//div[@class='link-box single']//a")
        print(len(download_anchors))

        for anchor in download_anchors:
            media_url = anchor.get_attribute("href")
            print(media_url)
            if not media_url:
                continue
            if ".mp4" in media_url:
                print("the post contains video. skip...")
                continue

            file_name = media_url.split("?")[0].split("/")[-1]
            stamp = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
            try:
                # The timeout keeps one stalled download from hanging the run.
                response = requests.get(media_url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print(f"download failed ({exc}). skip...")
                continue
            if response.status_code != 200:
                # Do not store an error page under an image file name.
                print(f"download failed with HTTP status {response.status_code}. skip...")
                continue

            with open(os.path.join(target_dir, f"{stamp}_{file_name}"), "wb") as img:
                img.write(response.content)
            print("file saved")
            saved += 1
    return saved


def main():
    """Log in, ask how many posts to fetch and download their photos.

    Steps: log in, open the profile given by ``link``, read the total number
    of posts, ask the user how many of the first posts to download, collect
    the post links by scrolling the profile, then save the photos under
    ``./instagram/<account>/``. Video links are skipped.

    Errors are reported on stdout instead of being raised, and the browser
    session is always shut down at the end.
    """
    try:
        login()
        driver.get(url=link)
        time.sleep(2)

        # rstrip: a trailing "/" in the link would otherwise give an empty name.
        get_username = link.rstrip("/").split("/")[-1]
        print(f"selected user: {get_username}")

        posts_text = driver.find_element(By.XPATH, "//ul[@class='_aa_7']//child::span").text
        posts = int(posts_text.replace(",", ""))
        print(f"total posts: {posts}")
        print("\n")

        if posts == 0:
            print(f"user {get_username} hasn't any posts")
            return

        input_posts = _ask_post_count(posts)

        print("checking the folder in current directory...\n")
        create_folder(os.path.join(".", "instagram"), get_username)

        print("downloading posts...\n")
        links = _collect_post_links(input_posts)
        count_img = _download_photos(links, get_username)
        print(f"from {posts} posts downloaded {count_img} photos")
    except Exception as exc:
        print("something wrong!")
        # Show the cause instead of hiding it behind the generic message.
        print(f"{type(exc).__name__}: {exc}")
    finally:
        print("task completed")
        # quit() closes every window and ends the driver session; a separate
        # close() beforehand could raise and leave the driver running.
        driver.quit()


if __name__ == "__main__":
    main()