I have the following script, which I want to scrape Google Images. It clicks on the first image and then clicks the next (>) button to switch to the next image.
It downloads the first image, but when it is the second image's turn, it throws an error:
Traceback (most recent call last):
  File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
My code:
import requests
import shutil
import time
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/80.0.3987.132 Safari/537.36'

options = Options()
# options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")

driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")

driver.find_element_by_class_name("rg_i").click()

i = 0
while i < 10:
    i += 1
    time.sleep(5)
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))).get_attribute("src")
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    local_file = open(r'C:/users/intel/desktop/local_image' + str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
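Side note: the wait can time out even though matching elements exist, because several images in Google's viewer share the n3VNCb class and the first match may be invisible — the first answer below builds on exactly that. A small diagnostic sketch, using only the question's own imports:

from selenium.common.exceptions import TimeoutException

try:
    attribute_value = WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))
    ).get_attribute("src")
except TimeoutException:
    # List every element carrying the class to see which one is actually there.
    candidates = driver.find_elements_by_css_selector('img.n3VNCb')
    print(len(candidates), [(c.get_attribute('src') or '')[:60] for c in candidates])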
I've tidied up and refactored your code a bit. The final result is capable of grabbing any number of images for the keywords of your choice (see SEARCH_TERMS):
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
SEARCH_TERMS = ['very', 'hot', 'chicks']
TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
if not os.path.isdir(os.path.dirname(TARGET_SAVE_LOCATION)):
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION))


def check_if_result_b64(source):
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False


def get_driver():
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/80.0.3987.132 Safari/537.36'
    options = Options()
    # options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    new_driver.get(f"https://www.google.com/search?q={'+'.join(SEARCH_TERMS)}&source=lnms&tbm=isch&sa=X")
    return new_driver


driver = get_driver()
first_search_result = driver.find_elements_by_xpath('//a/div/img')[0]
first_search_result.click()
right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements_by_xpath('//*[@data-noaft="1"]')[0]
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for the "to_be_loaded" image
thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: all image elements share the same class. Assuming that you always click "next":
    # the last element is the base64-encoded thumbnail of the "next" image,
    # the [-2] element is the one currently displayed
    target = driver.find_elements_by_xpath(image_finder_xp)[-2]

    # you need to wait until the image is completely loaded:
    # first the base64-encoded thumbnail is displayed, so we check whether
    # the displayed element's src still matches the cached thumbnail src.
    # However, sometimes the final result is the base64 content, so the wait
    # is capped at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")
    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests.get won't work because the src is not a URL
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, you later have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: this XPath is bad: """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div"""
    # if the page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
    next_arrow.click()
Disclaimer: I doubt that Google allows scraping on Search. You should check out https://www.google.com/robots.txt to find out.
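For instance, a quick check with the standard library's urllib.robotparser (a minimal sketch; Google's robots.txt rules may change over time):

import urllib.robotparser

# Parse Google's robots.txt and ask whether a generic crawler
# may fetch an image-search URL.
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://www.google.com/robots.txt")
rp.read()
print(rp.can_fetch("*", "https://www.google.com/search?q=test&tbm=isch"))  # expect False for /search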
That being said, I think there is a problem in your WebDriverWait call, though I am not sure what exactly it is. Since your driver already waits before that with time.sleep, I just tried to find the element directly, and it worked:
i = 0
while i < 10:
    i += 1
    time.sleep(5)
    attribute_value = driver.find_element_by_css_selector("img.n3VNCb").get_attribute("src")  # NEW LINE
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    local_file = open(r'C:/users/intel/desktop/local_image' + str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
Related
I'm currently trying to make a bot for TikTok that uploads a video after one hour, then another after the second hour, and so on, on multiple accounts at the same time with the same video. But I'm currently stuck at the login screen: it tells me "Too many login attempts. Try again later" every time Selenium tries to log in. The weird thing is that it worked perfectly fine five days ago without any errors, but now this happens.
Does anybody have any idea why this happens?
Here's the code; if there is anything else you need to know, feel free to ask:
import os
import time
from selenium_profiles.driver import driver as mydriver
from selenium_profiles.profiles import profiles
from selenium_profiles.scripts.driver_utils import actions
import undetected_chromedriver as uc
from multiprocessing import Process
from openpyxl import Workbook, load_workbook
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
import requests
import pyautogui
import random

wb = load_workbook("backend.xlsx")
ws = wb.active
accountCounter = 1

# These variables are changeable
accountAmount = 1
hashtags = "#rich #money #luxury #millionaire #business #luxurylifestyle #lifestyle"
upload_cooldown = 30
######

mobile_emulation = {
    "deviceName": "Nokia N9"
}

s = Service("chromedriver_bypass.exe")
options = Options()
mydriver = mydriver()
profile = profiles.Android()
os.chdir("Vids")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--profile-directory=Default")
options.add_experimental_option("mobileEmulation", mobile_emulation)
# user_agent = "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"


def findPasswords():
    passwords = []
    for x in range(0, accountAmount):
        password = ws["B" + str(x + 2)].value
        passwords.append(password)
    return passwords


def findUsers():
    users = []
    for x in range(0, accountAmount):
        user = ws["A" + str(x + 2)].value
        users.append(user)
    return users


def createWebDrivers(accAmount):
    webDrivers = []
    for x in range(0, accAmount):
        # webDriver_x = webdriver.Chrome(service=s, options=options)
        webDriver_x = mydriver.start(profile, uc_driver=False)
        webDrivers.append(webDriver_x)
    return webDrivers


def createWaitDrivers(webDrivers):
    waitDrivers = []
    webDriverAmount = len(webDrivers)
    for x in range(0, webDriverAmount):
        waitDriver_x = WebDriverWait(webDrivers[x], 7)
        waitDrivers.append(waitDriver_x)
    return waitDrivers


webDrivers = createWebDrivers(accountAmount)
waitDrivers = createWaitDrivers(webDrivers)
passwords = findPasswords()
users = findUsers()


def login(webDriver, waitDriver, user, password):
    # Fixing the bot detection
    webDriver.get("https://www.tiktok.com/login/phone-or-email/email")
    # webDriver.get("https://bot.sannysoft.com/")
    time.sleep(random.uniform(1, 3))
    y = input("Click away everything on the screen; if you have done so, please write 'y'")
    if "y" in y:
        # username
        userBox = webDriver.find_element(By.XPATH, "/html/body/div[2]/div[1]/div/div[1]/form/div[1]/input")
        userBoxLocation = actions.mid_location(userBox)
        action = actions.touch_action_chain()
        action.pointer_action.move_to_location(userBoxLocation["x"], userBoxLocation["y"])
        action.pointer_action.pointer_down()
        action.perform()
        time.sleep(random.uniform(1, 3))
        for char in user:
            waitDriver.until(EC.presence_of_element_located((By.NAME, "username"))).send_keys(char)
            time.sleep(random.uniform(.01, .1))
        time.sleep(10)
        # password
        waitDriver.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/div/div[1]/form/div[2]/div/input"))).click()
        time.sleep(random.uniform(1, 3))
        for char in password:
            waitDriver.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/div/div[1]/form/div[2]/div/input"))).send_keys(char)
            time.sleep(random.uniform(.1, .2))
        time.sleep(15)
        waitDriver.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/div/div[1]/form/button"))).click()
I tried changing the cdc_ variable in the chromedriver. I tried using undetected_chromedriver. I tried using selenium-stealth (currently in the code too), changing user agents, etc. It is also important to note that I use an Excel worksheet to retrieve account data (via openpyxl).
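For reference, this is roughly how selenium-stealth is wired into a plain Chrome driver — a minimal sketch using the argument values from the library's own documentation, not values specific to this bot:

from selenium import webdriver
from selenium_stealth import stealth

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(options=options)

# Patch navigator properties before visiting the target site.
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

driver.get("https://bot.sannysoft.com/")  # fingerprinting test page, also used in the question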
Here's the website: website
And here's my script:
from selenium import webdriver
import time
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium.webdriver.common.keys import Keys

# path to the folder where you placed your chromedriver
PATH = r"driver\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')

# driver = webdriver.Chrome(options=options, executable_path=PATH)
url = 'https://www.booking.com/hotel/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&#tab-main'
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get(url)
driver.maximize_window()
time.sleep(2)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
try:
    cookie.click()
except:
    pass
time.sleep(2)

country = driver.find_element_by_xpath('//*[@class="hp_nav_reviews_link toggle_review track_review_link_zh"]')
country.click()
time.sleep(2)

url2 = driver.current_url

commspos = []
commsneg = []
header = []
notes = []
dates = []
datestostay = []

results = requests.get(url2, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
reviews = soup.find_all('li', class_="review_item clearfix")

for review in reviews:
    try:
        commpos = review.find("p", class_="review_pos").text.strip()
    except:
        commpos = 'NA'
    commspos.append(commpos)
    try:
        commneg = review.find("p", class_="review_neg").text.strip()
    except:
        commneg = 'NA'
    commsneg.append(commneg)
    head = review.find('div', class_='review_item_header_content').text.strip()
    header.append(head)
    note = review.find('span', class_='review-score-badge').text.strip()
    notes.append(note)
    date = review.find('p', class_='review_item_date').text[23:].strip()
    dates.append(date)
    try:
        datestay = review.find('p', class_='review_staydate').text[20:].strip()
        datestostay.append(datestay)
    except:
        datestostay.append('NaN')

data = pd.DataFrame({
    'commspos': commspos,
    'commsneg': commsneg,
    'headers': header,
    'notes': notes,
    'dates': dates,
    'datestostay': datestostay,
})

data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')
But my CSV output file is empty. I assume it has to do with some kind of JavaScript. I've already encountered that: the script navigates to the subpage but isn't really "in" the subpage, hence it doesn't have access to that HTML and produces nothing in the output.
The sub-page you want loads its data from this URL:
https://www.booking.com/hotelfeaturedreviews/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=22417257c7da25395d270bcc7c6ec2e8;dest_id=-1456928;dest_type=city;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&&_=1625476042625
You can easily scrape this page and extract the data you need.
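For example, a minimal sketch reusing the selectors from your own script (the URL below is a truncated placeholder — paste the full hotelfeaturedreviews URL from above, and note that session parameters such as sid eventually expire):

import requests
from bs4 import BeautifulSoup

url2 = "https://www.booking.com/hotelfeaturedreviews/fr/d-argentine.fr.html?..."
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

soup = BeautifulSoup(requests.get(url2, headers=headers).text, "html.parser")
# Assuming the sub-page uses the same review markup as the main page.
for review in soup.find_all('li', class_="review_item clearfix"):
    pos = review.find("p", class_="review_pos")
    neg = review.find("p", class_="review_neg")
    print(pos.text.strip() if pos else 'NA', '|', neg.text.strip() if neg else 'NA')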
Using the link to an image on Flickr, requests only returns HTML up to the comment:
`<!-- rendered with love by pprd1-node580-lh1.manhattan.bf1.yahoo.com -->`
(see the HTML image below).
I would like to access the links within the img elements three div elements below, so I would appreciate any input.
from bs4 import BeautifulSoup
import logging
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import shutil
import sys
import time

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s - %(levelname)s - %(message)s")


def flickr_images():
    try:
        search_term, number_images = sys.argv[1:]
        num_req_images = int(number_images)
    except ValueError:
        print("Something went wrong. Command line input must be of "
              "format: 'filename searchterm numberimages'")
        return

    # navigate to search results page
    driver = webdriver.Firefox()
    # poll DOM for max 10 secs if element not immediately available
    driver.implicitly_wait(10)
    driver.get("https://www.flickr.com/search/?text=" + search_term)
    driver.maximize_window()

    # 0sec wait = 25images, 1sec = 48, 3+sec = 98
    time.sleep(3)
    image_link_elems = driver.find_elements_by_class_name("overlay")

    # in case requested is > found
    num_images_tosave = min(num_req_images, len(image_link_elems))
    image_elems_tosave = image_link_elems[:num_images_tosave]
    print("{} images found.".format(num_images_tosave))
    logging.info("Length photos: {}".format(len(image_link_elems)))

    # extract image srcs from found elements
    src_links = []
    image_links = [link.get_attribute("href") for link in image_elems_tosave]
    for image_link in image_links:
        res = requests.get(image_link)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        src_elem = soup.select(".zoom-small")
HTML image: (screenshot of the markup not reproduced here)
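Since requests only receives the pre-rendered markup, one option is to stay inside Selenium (which has already executed the JavaScript) and read the src from the live DOM. A rough sketch of how the loop could continue inside flickr_images() — ".zoom-small" comes from the question, but the "img" child selector is an assumption to verify against the actual page:

    # Continuation sketch: render each photo page in Selenium and
    # download the image from its src attribute.
    for image_link in image_links:
        driver.get(image_link)
        img_elem = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".zoom-small img"))
        )
        src = img_elem.get_attribute("src")
        src_links.append(src)

        res = requests.get(src, stream=True)
        res.raise_for_status()
        with open(os.path.basename(src), 'wb') as f:
            shutil.copyfileobj(res.raw, f)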
I am trying to retrieve all possible year, model, and make values from https://www.osram-americas.com/en-us/applications/automotive-lighting-systems/Pages/lrgmain.aspx:
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get("https://www.osram-americas.com/en-us/applications/automotive-lighting-systems/Pages/lrgmain.aspx")
# Switch to new window opened
driver.switch_to.window(driver.window_handles[-1])
# Close the new window
driver.close()
# Switch back to original browser (first window)
driver.switch_to.window(driver.window_handles[0])
el = driver.find_element_by_css_selector("div#fldYear a.sbToggle")
el.click()
select = Select(el)
select.select_by_visible_text("2016")
However, if I try to select the year 2016, it gives this error:
Select only works on <select> elements, not on <a>
You don't actually need Selenium or any automated visual interactions to get the year+make+model data from the page; you can approach the problem with requests only, making the appropriate GET requests:
# -*- coding: utf-8 -*-
from collections import defaultdict
from pprint import pprint

import requests

year = 2016
d = defaultdict(lambda: defaultdict(list))
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    session.get("https://www.osram-americas.com/en-us/applications/automotive-lighting-systems/Pages/lrgmain.aspx")

    while True:
        response = session.get("https://www.osram-americas.com/_Layouts/Sylvania.Web.LRGHandler/LRGHandler.ashx",
                               params={"rt": "fetchmake", "year": str(year)})
        data = response.json()
        if not data:  # break if no makes in a year
            break

        for make in data:
            response = session.get("https://www.osram-americas.com/_Layouts/Sylvania.Web.LRGHandler/LRGHandler.ashx",
                                   params={"rt": "fetchmodel", "year": str(year), "make": make["Id"]})
            for model in response.json():
                d[year][make["Value"]].append(model["Value"])

        year -= 1

pprint(dict(d))
The problem with your current approach is that you are finding an a element and trying to use it as a select element; the Select class only works with select elements.
Note that in this case, it's easier to get to the invisible select element and read the years from its options directly:
options = [option.get_attribute("innerText") for option in driver.find_elements_by_css_selector("select#ddlYear option")[1:]]
print(options)
The [1:] slice here is to skip the very first "Select Year" option.
Complete working code:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.osram-americas.com/en-us/applications/automotive-lighting-systems/Pages/lrgmain.aspx")
wait = WebDriverWait(driver, 10)
toggle = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#fldYear a.sbToggle")))
toggle.click()
options = [option.get_attribute("innerText") for option in driver.find_elements_by_css_selector("select#ddlYear option")[1:]]
print(options)
Prints:
[u'2016', u'2015', u'2014', u'2013', u'2012', u'2011', u'2010', u'2009', u'2008', u'2007', u'2006', u'2005', u'2004', u'2003', u'2002', u'2001', u'2000', u'1999', u'1998', u'1997', u'1996', u'1995', u'1994', u'1993', u'1992', u'1991', u'1990', u'1989', u'1988', u'1987', u'1986', u'1985', u'1984', u'1983', u'1982', u'1981', u'1980', u'1979', u'1978', u'1977', u'1976', u'1975', u'1974', u'1973', u'1972', u'1971', u'1970', u'1969', u'1968', u'1967', u'1966', u'1965', u'1964', u'1963', u'1962', u'1961', u'1960', u'1959', u'1958', u'1957', u'1956', u'1955']
This is the thing: I'm using PhantomJS and Selenium in Python to render pages. This is the code:
import sys, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

path_to_chromedriver = 'C:\\..\\chromedriver'

section = sys.argv[1]
path = sys.argv[2]
links = sys.argv[3]

listOfLinks = []
file = open(links, 'r')
for link in file:
    listOfLinks.append(link)

dr = webdriver.Chrome(executable_path=path_to_chromedriver)

cont = 0
for link in listOfLinks:
    try:
        dr.get(link)
        # Wait.
        element = WebDriverWait(dr, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "_img-zoom"))
        )
        time.sleep(1)
        htmlPath = path + section + "_" + str(cont) + ".html"
        # Write HTML.
        file = open(htmlPath, 'w')
        file.write(dr.page_source)
        file.close()
        cont = cont + 1
    except:
        print("Exception")

dr.quit()
This code creates an HTML file for each of the links received as parameters.
Each file is then parsed by Jsoup in Java:
Document document = Jsoup.parse( file, "UTF-8" );
However, special characters such as '€', 'á', 'é', 'í', etc. are not decoded properly and are replaced by '?'. How can I solve this?
Solution found by Uzochi:
Try Document document = Jsoup.parse( file, "ISO-8859-1" );
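A related note: on Windows, Python 3's open() without an explicit encoding uses the ANSI code page (typically cp1252, which is close to ISO-8859-1) — which is why parsing with ISO-8859-1 works. A hedged alternative is to fix it on the Python side by writing UTF-8 explicitly, so Jsoup can stay at UTF-8:

# Sketch: write the rendered page as UTF-8 so the bytes on disk
# match the charset Jsoup is told to use.
htmlPath = path + section + "_" + str(cont) + ".html"
with open(htmlPath, 'w', encoding='utf-8') as f:
    f.write(dr.page_source)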