-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
92 lines (74 loc) · 3.05 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from time import sleep
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import asyncio
import openpyxl
import threading
# Load the spreadsheet once, at import time. `sheet` is the workbook's active
# worksheet; main() reads the values of its first column and hands each one to
# scrape_image() as a search term.
workbook = openpyxl.load_workbook('data.xlsm')
sheet = workbook.active
async def scrape_image(item, semaphore):
async with semaphore:
options = Options()
# options.add_argument("--headless")
options.add_argument(
'--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
driver = webdriver.Chrome(options=options)
file_path = 'path.txt'
with open(file_path, 'r') as f:
path = f.read()
input_path = rf'{path}'
site_url = "https://www.astrobin.com/search/?q="
complete_path = os.path.join(input_path, 'images')
if not os.path.exists(complete_path):
os.mkdir(complete_path)
driver.get(site_url)
search_box = driver.find_element(By.XPATH, "//input[@id='q']")
search_box.send_keys(item)
await asyncio.sleep(1)
search_box.send_keys(Keys.ENTER)
await asyncio.sleep(5)
image_urls = driver.find_elements(By.XPATH, "//figcaption/a")
image_urls_array = []
for urls in image_urls:
image_urls_array.append(urls.get_attribute('href'))
print(image_urls)
print(len(image_urls_array))
for i, url in enumerate(image_urls_array):
driver.get(url)
while True:
image_url = driver.find_element(By.XPATH, "//figure//img").get_attribute('src')
if image_url.split('.')[-1] == "jpg" or image_url.split('.')[-1] == "png":
print(image_url)
break
else:
pass
await asyncio.sleep(1)
filename = os.path.join(complete_path, item + str(i + 1) + '.' + image_url.split('.')[-1])
response = requests.get(image_url)
if response.status_code == 200:
with open(filename, "wb") as f:
f.write(response.content)
print(f"Image saved successfully!{i + 1}")
else:
print("Failed to download image")
driver.quit()
async def main():
    """Scrape images for every non-empty cell in the first column of ``sheet``.

    One ``scrape_image`` coroutine is scheduled per search term; the shared
    semaphore caps how many of them run at once. Failures are reported per
    term instead of being silently dropped, and one failing term does not
    stop the others.
    """
    MAX_CONCURRENT_THREADS = 4
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_THREADS)

    # values_only=True yields raw cell values; empty cells come back as None.
    items = [
        item
        for column in sheet.iter_cols(
            min_row=1, max_row=sheet.max_row, min_col=1, max_col=1, values_only=True)
        for item in column
        if item is not None
    ]
    if not items:
        return

    # return_exceptions=True: collect each task's outcome rather than letting
    # the first error propagate while the remaining scrapes are still running.
    results = await asyncio.gather(
        *(scrape_image(item, semaphore) for item in items),
        return_exceptions=True,
    )
    for item, result in zip(items, results):
        if isinstance(result, BaseException):
            print(f"Scraping failed for {item!r}: {result!r}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())