Hi, I'm crawling Google Images using Selenium, but it isn't working well. How can I get this code to work? My code is below.
Previously I used google_images_download, but it suddenly stopped working, so I'm looking for a new approach. I hope someone can help. Thank you.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import os
import urllib.request as urllib2
import argparse

searchterm = 'spider'  # will also be the name of the folder
url = "https://www.google.co.in/search?q=" + searchterm + "&source=lnms&tbm=isch"

# NEED TO DOWNLOAD CHROMEDRIVER, insert path to chromedriver inside parentheses in following line
browser = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')
browser.get(url)
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)

for _ in range(500):
    browser.execute_script("window.scrollBy(0,10000)")

for x in browser.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'):
    counter = counter + 1
    print("Total Count:", counter)
    print("Successful Count:", succounter)
    print("URL:", json.loads(x.get_attribute('innerHTML'))["ou"])

    img = json.loads(x.get_attribute('innerHTML'))["ou"]
    imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
    try:
        # pass the header dict itself, not a dict wrapping the dict
        req = urllib2.Request(img, headers=header)
        raw_img = urllib2.urlopen(req).read()
        File = open(os.path.join(searchterm, searchterm + "_" + str(counter) + "." + imgtype), "wb")
        File.write(raw_img)
        File.close()
        succounter = succounter + 1
    except:
        print("can't get img")

print(succounter, "pictures successfully downloaded")
browser.close()
I ended up crawling images from websites other than Google Images.
I also ran into problems crawling images from Google with a method like yours that relies on rg_meta.
The source code of the Google Image search results page changed at the beginning of 2020, and it no longer contains rg_meta.
The rg_meta class was replaced with randomized strings like:
rg_X XXXXXX XXXXXX
I think Google set out to block crawling bots and push people toward the Google Custom Search API.
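For what it's worth, if you do go the official route, the Custom Search JSON API can return image results. Here is a minimal sketch, assuming you have created an API key and a Programmable Search Engine; the API_KEY and CX values below are placeholders:

import requests

API_KEY = "YOUR_API_KEY"      # placeholder: create one in Google Cloud Console
CX = "YOUR_SEARCH_ENGINE_ID"  # placeholder: Programmable Search Engine ID

def image_search(query, num=10):
    # searchType=image restricts results to images; each item's "link" is the image URL
    resp = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"key": API_KEY, "cx": CX, "q": query,
                "searchType": "image", "num": num},
    )
    resp.raise_for_status()
    return [item["link"] for item in resp.json().get("items", [])]

for link in image_search("spider"):
    print(link)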
Related
I need to download a lot of images for a personal project, and I wanted to use a script to download them automatically from Google.
I found a script on Stack Overflow that almost works: it opens Chrome and downloads the first image, but then breaks when trying to move to the second image.
Here is the code I found:
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

CHROME_DRIVER_LOCATION = r'C:/pathtochromedriver/chromedriver.exe'
SEARCH_TERMS = ['cats', 'dogs']
TARGET_SAVE_LOCATION = os.path.join(r'D:\Pictures\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
if not os.path.isdir(os.path.dirname(TARGET_SAVE_LOCATION)):
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION))

def check_if_result_b64(source):
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False

def get_driver():
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/80.0.3987.132 Safari/537.36'
    options = Options()
    #options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    new_driver.get(f"https://www.google.com/search?q={'+'.join(SEARCH_TERMS)}&source=lnms&tbm=isch&sa=X")
    return new_driver

driver = get_driver()
first_search_result = driver.find_elements_by_xpath('//a/div/img')[0]
first_search_result.click()
right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements_by_xpath('//*[@data-noaft="1"]')[0]
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for "to_be_loaded image"
thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # the last element is the base64 encoded thumbnail version of the "next image",
    # the [-2] element is the element currently displayed
    target = driver.find_elements_by_xpath(image_finder_xp)[-2]

    # you need to wait until the image is completely loaded:
    # first the base64 encoded thumbnail will be displayed,
    # so we check if the displayed element's src matches the cached thumbnail src.
    # However sometimes the final result is the base64 content, so the wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests.get won't work because the src is not a URL
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: an XPath like """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" is bad: if the page layout changes, it breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    print(svg_arrows_xpath)
    print(driver.find_elements_by_xpath(str(svg_arrows_xpath)))
    next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
    next_arrow.click()
I think the problem is caused by the last line, "driver.find_elements_by_xpath(svg_arrows_xpath)[-3]".
I suspect the page has changed and the XPath no longer points to the right element.
Unfortunately, I have zero knowledge of Selenium.
Could you please tell me how to modify it to point to the right element?
Could you also tell me how this works and how I could have solved it myself?
Thank you in advance!
Pages get redesigned, and this script was probably written for an older version of Google. Selenium itself also changes, so older methods may not work in newer versions.
I can find the elements if I skip the last part of the XPath:
svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "trigger")]'
The element also has a problem with .click(): Selenium considers it not interactable and refuses to click it.
But it can be clicked with JavaScript:
driver.execute_script('arguments[0].click()', next_arrow)
svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "trigger")]'  # dropped: //*[@viewBox="0 0 24 24"]
#print(svg_arrows_xpath)
found = driver.find_elements_by_xpath(svg_arrows_xpath)
print('len(found):', len(found))
next_arrow = found[-3]
# Selenium's own .click() raises "element not interactable" here,
# so click via JavaScript instead:
#next_arrow.click()
driver.execute_script('arguments[0].click()', next_arrow)
Hi everyone.
I am working on a Python project with Selenium to scrape data.
The problem is that I have to scrape the data every 5 minutes.
I run Chrome through Selenium, but the scraping is very slow: one run takes at least 30 minutes, so I can't get the data every 5 minutes.
If you have experience in this field, please help me.
If you can suggest other approaches (for example Beautiful Soup), I would be very happy.
Note: the site I want to get data from is rendered using JavaScript.
This is my source code. I am testing it.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

driver = webdriver.Chrome()
driver.set_window_size(800, 600)

tickerNames = []
finvizUrl = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
nasdaqUrl = "https://www.nasdaq.com/market-activity/stocks/"
tickerPrice = []

def openPage(url):
    driver.get(url)

def exitDrive():
    driver.quit()

def getTickers():
    tickers = driver.find_elements_by_class_name('screener-link-primary')
    for i in range(len(tickers)):
        tickerNames.append(tickers[i].text)
    return tickerNames

def comparePrice(tickers):
    for i in range(len(tickers)):
        openPage(nasdaqUrl + tickers[i])
        # append instead of indexing into the (still empty) list
        tickerPrice.append(driver.find_element_by_class_name('symbol-page-header__pricing-price').text)
    return tickerPrice

openPage(finvizUrl)
# getTickers()
print(comparePrice(getTickers()))
There seems to be an API on the Nasdaq site that you can query (found using the browser's network tools), so there isn't really any need to use Selenium for this. Here is an example that gets the data using requests:
import requests
import lxml.html
import time

FINVIZ_URL = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
NASDAQ_URL = "https://api.nasdaq.com/api/quote/{}/summary?assetclass=stocks"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
}
session = requests.Session()
session.headers.update(headers)

r = session.get(FINVIZ_URL)
# Get data using lxml xpath but use whatever you want
x = lxml.html.fromstring(r.text)
stocks = x.xpath("//*[@class='screener-link-primary']/text()")

for stock in stocks:
    data = session.get(NASDAQ_URL.format(stock))
    print(f"INFO for {stock}")
    print(data.json())  # This might have the data you want
    # Sleep in case there is a rate limit (may not be needed)
    time.sleep(5)
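Since the goal is a fresh scrape every 5 minutes, you can wrap the above in a simple scheduling loop. A minimal sketch, assuming the requests/lxml scrape above is moved into a scrape_once() function (a name introduced here just for illustration):

import time

INTERVAL_SECONDS = 5 * 60  # 5 minutes

def scrape_once():
    # hypothetical wrapper: put the requests/lxml scrape from above in here
    pass

while True:
    started = time.time()
    try:
        scrape_once()
    except Exception as exc:  # keep the loop alive on transient network failures
        print("scrape failed:", exc)
    # sleep only for the time remaining in the interval so runs stay on schedule
    time.sleep(max(0.0, INTERVAL_SECONDS - (time.time() - started)))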
I'm trying my hand at some Python code, and am having a hell of a time with Selenium. Any help you could offer would be super appreciated. Long story short, I'm trying to pull the average rating of a given movie from Letterboxd.com. For example:
https://letterboxd.com/film/the-dark-knight/
The value I'm looking for is the average rating to 2 decimal places, which you can see if you mouseover the rating that's displayed on the page:
[Screenshot: tooltip showing "Average Rating 4.43" on mouseover]
In this case, the average rating is 4.43, and that's the number I'm trying to retrieve.
So far, I've managed to grab the 1-decimal-place version using driver.find_elements_by_class_name('average-rating').
In this case, that returns "4.4". But I need "4.43".
I can see the correct value in the developer tools. It appears twice. Once here:
<span class="average-rating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
4.4
And again in what appears to be metadata:
<meta name="twitter:data2" content="4.43 out of 5">
Any suggestions on how I can grab that value correctly? Thanks so much!
Cheers,
Ari
There is another way you might want to consider: grab the rating, along with the number of users who voted for it, from the page source, since both are available there inside a script tag.
import re
import json
import requests

URL = 'https://letterboxd.com/film/the-dark-knight/'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    r = s.get(URL)
    # the JSON payload sits in a CDATA-wrapped script tag, hence the regex anchor
    data = json.loads(re.findall(r"CDATA[^{]+(.*)", r.text)[0])
    rating = data['aggregateRating']['ratingValue']
    user_voted = data['aggregateRating']['ratingCount']
    print(rating, user_voted)
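If the regex feels fragile, roughly the same extraction can be done by locating the script tag itself. A sketch assuming the page keeps its application/ld+json block; slicing from the first { to the last } sidesteps whatever CDATA wrapper surrounds the JSON:

import json
import requests
from bs4 import BeautifulSoup

html = requests.get('https://letterboxd.com/film/the-dark-knight/',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
script = BeautifulSoup(html, 'html.parser').find('script', type='application/ld+json')
raw = script.get_text()
# slice out the JSON object itself to drop any CDATA/comment wrapper
data = json.loads(raw[raw.find('{'):raw.rfind('}') + 1])
print(data['aggregateRating']['ratingValue'], data['aggregateRating']['ratingCount'])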
Please find the code below and let me know if anything is unclear. To hover over the main rating you should use ActionChains.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
import time

driver = webdriver.Chrome()
driver.get("https://letterboxd.com/film/the-dark-knight/")
wait = WebDriverWait(driver, 20)
time.sleep(10)

Main_Rating = driver.find_element_by_class_name('average-rating')
print("Main Rating is :- " + Main_Rating.text)
time.sleep(5)

# hover over the rating so the tooltip with the 2-decimal value appears
ActionChains(driver).move_to_element(Main_Rating).perform()
More_Rating_info = driver.find_element_by_xpath('//div[@class="twipsy-inner"]').text
More_Message = More_Rating_info.split()
print("More Rating :- " + More_Message[3])
Try the code below, which uses Beautiful Soup and requests.
Benefits of using Beautiful Soup and requests here:
Fast in terms of getting results.
Fewer errors.
Easier access to HTML tags.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

def extract_avg_rating():
    movie_name = 'the-dark-knight'
    url = 'https://letterboxd.com/film/' + movie_name
    session = requests.Session()
    url_response = session.get(url, verify=False)
    soup = bs(url_response.text, 'html.parser')
    # the 20th meta tag holds the "4.43 out of 5" content (positional index, so fragile)
    extracted_meta = soup.find_all('meta')[19]
    extracted_rating = extracted_meta.attrs['content'].split(' ')[0]
    print('Movie ' + movie_name + ' rating ' + extracted_rating)

extract_avg_rating()
In the above code you can set movie_name to any film, for example lucky-grandma, and it will give you that film's rating. The code is dynamic and can help you extract other movies' ratings and other information as needed.
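One caveat: find_all('meta')[19] depends on that meta tag staying in exactly the 20th position, which can change whenever the page head is edited. Selecting the tag by its name attribute (the twitter:data2 tag the asker already spotted) should be more stable; a small sketch under that assumption:

import requests
from bs4 import BeautifulSoup

def extract_avg_rating(movie_name='the-dark-knight'):
    url = 'https://letterboxd.com/film/' + movie_name
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(resp.text, 'html.parser')
    # match the tag by its name attribute instead of by position in the document
    tag = soup.find('meta', attrs={'name': 'twitter:data2'})
    return tag['content'].split(' ')[0] if tag else None  # "4.43 out of 5" -> "4.43"

print(extract_avg_rating())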
Here is my code. I want to download Google image search results.
This is a link to a Google image search.
For example, I want to get the first result.
I use the code below to click the first image, and it works:
driver.find_elements_by_xpath("//*[@id='rg_s']/div[1]/a/img")[0].click()
And then I get a result like this:
I want to download the original image, so I use:
img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[2]/a/img")
But I get a NoneType here, and I don't know why. The XPath is correct. Why do I get a NoneType?
This is the error msg I get :
selenium.common.exceptions.ElementNotInteractableException: Message:
from selenium import webdriver
from binascii import a2b_base64
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib2
import sys
import time
import re

# adding path to geckodriver to the OS environment variable
os.environ["PATH"] += os.pathsep + os.getcwd()
download_path = "dataset/"

def main():
    searchtext = "wallpaper"
    num_requested = 10
    if not os.path.exists(download_path + searchtext.replace(" ", "_")):
        os.makedirs(download_path + searchtext.replace(" ", "_"))
    url = "https://www.google.co.in/search?q=" + searchtext + "&source=lnms&tbm=isch"
    driver = webdriver.Firefox()
    driver.get(url)

    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    extensions = ["jpg", "jpeg", "png", "gif"]
    img_count = 0
    downloaded_img_count = 0

    driver.find_elements_by_xpath("//*[@id='rg_s']/div[1]/a/img")[0].click()
    img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[2]/a/img")
    print img_urllist
    img_urllist[0].click()

    print "Total downloaded: ", downloaded_img_count, "/", img_count
    driver.quit()

if __name__ == "__main__":
    main()
Try the XPath below. I inspected the element and found an XPath different from yours:
img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[3]/a/img")
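Since the reported error was ElementNotInteractableException, it may also help to wait until the element is actually clickable before interacting with it. A sketch in the same Selenium 3 style as the question, assuming driver is the instance from the script above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

xpath = "//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[3]/a/img"
# wait up to 10 seconds for the element to be visible and enabled
img = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, xpath)))
img.click()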
I'm having a very tough time searching Google image search with Python. I need to do it using only standard Python libraries (so urllib, urllib2, json, ...).
Can somebody please help? Assume the image is jpeg.jpg and is in the same folder I'm running Python from.
I've tried a hundred different code versions, using headers, user-agent, base64 encoding, different URLs (images.google.com, http://images.google.com/searchbyimage?hl=en&biw=1060&bih=766&gbv=2&site=search&image_url={{URL To your image}}&sa=X&ei=H6RaTtb5JcTeiALlmPi2CQ&ved=0CDsQ9Q8, etc.).
Nothing works; it's always an error: 404, 401, or broken pipe :(
Please show me some Python script that will actually search Google Images with my own image as the search data ('jpeg.jpg' stored on my computer/device).
Thank you to whoever can solve this,
Dave :)
I use the following code in Python to search for Google images and download the images to my computer:
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson

# Define search term
searchTerm = "hello world"

# Replace spaces ' ' in search term with '%20' in order to comply with the request format
searchTerm = searchTerm.replace(' ', '%20')

# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()

# Set count to 0
count = 0

for i in range(0, 10):
    # Notice that the start changes for each iteration in order to request a new set of images for each loop
    url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0&q=' + searchTerm + '&start=' + str(i * 4) + '&userip=MyIP')
    print url
    request = urllib2.Request(url, None, {'Referer': 'testing'})
    response = urllib2.urlopen(request)

    # Get results using JSON
    results = simplejson.load(response)
    data = results['responseData']
    dataInfo = data['results']

    # Iterate over each result and get the unescaped url
    for myUrl in dataInfo:
        count = count + 1
        print myUrl['unescapedUrl']
        myopener.retrieve(myUrl['unescapedUrl'], str(count) + '.jpg')

    # Sleep for one second to prevent IP blocking from Google
    time.sleep(1)
You can also find very useful information here.
The Google Image Search API is deprecated, so instead we scrape the Google search results page and download the images using regex and Beautiful Soup:
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os

def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)))

image_type = "Action"
# you can change the query for the image here
query = "Terminator 3 Movie"
query = query.split()
query = '+'.join(query)
url = "https://www.google.co.in/search?es_sm=122&source=lnms&tbm=isch&sa=X&ei=4r_cVID3NYayoQTb4ICQBA&ved=0CAgQ_AUoAQ&biw=1242&bih=619&q=" + query
print url

header = {'User-Agent': 'Mozilla/5.0'}
soup = get_soup(url, header)

# thumbnails in the results page are served from gstatic.com
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})]
#print images

for img in images:
    raw_img = urllib2.urlopen(img).read()
    # add the directory for your image here
    DIR = "C:\\Users\\hp\\Pictures\\valentines\\"
    cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
    print cntr
    f = open(DIR + image_type + "_" + str(cntr) + ".jpg", 'wb')
    f.write(raw_img)
    f.close()