Why do I get nothing when I use the XPath I copied with Selenium? - python

Here is my code. I want to download the image search results from Google.
This is a link to a Google image search.
For example, I want to get the first result.
I use the code below to click the first image, and it works.
driver.find_elements_by_xpath("//*[#id='rg_s']/div[1]/a/img")[0].click()
And then I get the result like this:
I want to download the original image, so I use
img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[2]/a/img")
But I get a NoneType here, and I don't know why. The XPath is correct, so why do I get a NoneType?
This is the error message I get:
selenium.common.exceptions.ElementNotInteractableException: Message:
from selenium import webdriver
from binascii import a2b_base64
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib2
import sys
import time
import re

# adding path to geckodriver to the OS environment variable
os.environ["PATH"] += os.pathsep + os.getcwd()
download_path = "dataset/"

def main():
    searchtext = "wallpaper"
    num_requested = 10
    if not os.path.exists(download_path + searchtext.replace(" ", "_")):
        os.makedirs(download_path + searchtext.replace(" ", "_"))
    url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch"
    driver = webdriver.Firefox()
    driver.get(url)
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    extensions = [ "jpg", "jpeg", "png", "gif" ]
    img_count = 0
    downloaded_img_count = 0
    driver.find_elements_by_xpath("//*[@id='rg_s']/div[1]/a/img")[0].click()
    img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[2]/a/img")
    print img_urllist
    img_urllist[0].click()
    print "Total downloaded: ", downloaded_img_count, "/", img_count
    driver.quit()

if __name__ == "__main__":
    main()

Try using the XPath below. I inspected the element and found an XPath different from yours:
img_urllist = driver.find_elements_by_xpath("//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[3]/a/img")
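The ElementNotInteractableException in your traceback usually means the preview panel has not finished rendering when the click fires, so the safer pattern is an explicit wait. A minimal sketch, assuming the corrected XPath above (Google's markup changes often, so re-verify it in the inspector):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://www.google.co.in/search?q=wallpaper&source=lnms&tbm=isch")

# Wait for the thumbnail to become clickable instead of clicking immediately.
thumb = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//*[@id='rg_s']/div[1]/a/img")))
thumb.click()

# Same idea for the preview image in the side panel; this XPath is the
# one suggested above and is an assumption about the current layout.
preview = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, "//*[@id='irc_cc']/div[2]/div[1]/div[2]/div[3]/a/img")))
print(preview.get_attribute("src"))
driver.quit()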

Related

Python library newspaper is not returning the published date

I am using the newspaper Python library to extract some data from news stories. The problem is that I am not getting this data for some URLs. These URLs work fine; they all return 200. I am doing this for a very large dataset, but this is one of the URLs for which the date extraction did not work. The code works for some links and not others (from the same domain), so I know the problem isn't something like my IP being blocked for too many requests. I tried it on just one URL and got the same result (no data).
import os
import sys
from newspaper import Article

def split(link):
    try:
        story = Article(link)
        story.download()
        story.parse()
        date_time = str(story.publish_date)
        split_date = date_time.split()
        date = split_date[0]
        if date != "None":
            print(date)
    except:
        print("This URL did not return a published date. Try a different URL.")
        print(link)

if __name__ == "__main__":
    link = "https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one"
    split(link)
I am getting this output:
This URL did not return a published date. Try a different URL.
https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one
Please check the link; I checked it and it's unavailable now.
If the link is unavailable, the code will not work.
Try adding some error handling to your code to catch URLs that return a 404, such as this one: https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one
from newspaper import Config
from newspaper import Article
from newspaper.article import ArticleException

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one'

try:
    article = Article(base_url, config=config)
    article.download()
    article.parse()
except ArticleException as error:
    print(error)
Output:
Article `download()` failed with 404 Client Error: Not Found for url: https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one on URL https://www.aljazeera.com/program/featured-documentaries/2020/12/29/lords-of-water-episode-one
Newspaper3k has multiple ways to extract the publish dates from articles. Take a look at this document that I wrote on how to use Newspaper3k.
Here is an example for this valid URL https://www.aljazeera.com/program/featured-documentaries/2022/3/31/lords-of-water that extracts data elements from the page's meta tags.
from newspaper import Config
from newspaper import Article
from newspaper.article import ArticleException

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.aljazeera.com/program/featured-documentaries/2022/3/31/lords-of-water'

try:
    article = Article(base_url, config=config)
    article.download()
    article.parse()
    article_meta_data = article.meta_data

    article_title = [value for (key, value) in article_meta_data.items() if key == 'pageTitle']
    print(article_title)

    article_published_date = str([value for (key, value) in article_meta_data.items() if key == 'publishedDate'])
    print(article_published_date)

    article_description = [value for (key, value) in article_meta_data.items() if key == 'description']
    print(article_description)
except ArticleException as error:
    print(error)
Output:
['Lords of Water']
['2022-03-31T06:08:59']
['Is water the new oil? We expose the financialisation of water.']
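If you want to keep your original split() helper, one option is to fall back to the meta tags when story.publish_date comes back as None. A minimal sketch combining the two approaches; note that 'publishedDate' is the key the Al Jazeera pages expose, and other domains may use a different one:
from newspaper import Article, Config
from newspaper.article import ArticleException

config = Config()
config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
config.request_timeout = 10

def get_publish_date(link):
    try:
        story = Article(link, config=config)
        story.download()
        story.parse()
        if story.publish_date is not None:
            return str(story.publish_date).split()[0]
        # Fall back to the page's meta tags; 'publishedDate' is the key
        # Al Jazeera uses -- other sites may expose a different one.
        return story.meta_data.get('publishedDate')
    except ArticleException:
        return None

print(get_publish_date("https://www.aljazeera.com/program/featured-documentaries/2022/3/31/lords-of-water"))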

Webscraping Google Images with Python and Selenium: code breaks when trying to download multiple images

I need to download a lot of images for a personal project and I wanted to use a script to download them automatically from Google.
I found a script on Stack Overflow which mostly works: I am able to open Chrome and download the first image, but then the script breaks when trying to move to the second image.
Here is the code I found:
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
CHROME_DRIVER_LOCATION = r'C:/pathtochromedriver/chromedriver.exe'
SEARCH_TERMS = ['cats', 'dogs']
TARGET_SAVE_LOCATION = os.path.join(r'D:\Pictures\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
if not os.path.isdir(os.path.dirname(TARGET_SAVE_LOCATION)):
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION))

def check_if_result_b64(source):
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False

def get_driver():
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/80.0.3987.132 Safari/537.36'
    options = Options()
    # options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    new_driver.get(f"https://www.google.com/search?q={'+'.join(SEARCH_TERMS)}&source=lnms&tbm=isch&sa=X")
    return new_driver

driver = get_driver()
first_search_result = driver.find_elements_by_xpath('//a/div/img')[0]
first_search_result.click()
right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements_by_xpath('//*[@data-noaft="1"]')[0]
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for the "to_be_loaded image"
thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # the last element is the base64 encoded thumbnail version of the "next image",
    # the [-2] element is the element currently displayed
    target = driver.find_elements_by_xpath(image_finder_xp)[-2]

    # you need to wait until the image is completely loaded:
    # first the base64 encoded thumbnail will be displayed,
    # so we check if the displayed element's src matches the cached thumbnail src.
    # However, sometimes the final result is the base64 content, so the wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")
    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests.get won't work because the src is not a url
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: this XPath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" - if the page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    print(svg_arrows_xpath)
    print(driver.find_elements_by_xpath(str(svg_arrows_xpath)))
    next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
    next_arrow.click()
I think the problem is caused by the last line, driver.find_elements_by_xpath(svg_arrows_xpath)[-3].
I think the page code has changed and the XPath is not pointing to the right element anymore.
Unfortunately, I have zero knowledge of Selenium.
Could you please tell me how to modify it to point to the right element?
Could you also tell me how this works and how I could have solved it myself?
Thank you in advance!
Pages get redesigned, and this script was probably created for an older version of Google. Selenium also changes, and older methods may not work in newer versions.
I can find the elements if I skip the last part of the XPath:
svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "trigger")]'
The element also has a problem with the .click() function, because Selenium sees it as not interactable and can't click it. But it can be clicked using JavaScript:
driver.execute_script('arguments[0].click()', next_arrow)
svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "trigger")]'  # dropped: //*[@viewBox="0 0 24 24"]
#print(svg_arrows_xpath)
found = driver.find_elements_by_xpath(svg_arrows_xpath)
print('len(found):', len(found))
next_arrow = found[-3]
#next_arrow.click()
driver.execute_script('arguments[0].click()', next_arrow)
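For reference, a small helper that scrolls the element into view and then clicks it through JavaScript works around most "element not interactable" failures. This is a sketch, not something tied to Google's current layout:
def js_click(driver, element):
    # Scroll the element into view first, then click via JavaScript,
    # bypassing Selenium's visibility and interactability checks.
    driver.execute_script('arguments[0].scrollIntoView({block: "center"});', element)
    driver.execute_script('arguments[0].click();', element)

found = driver.find_elements_by_xpath(svg_arrows_xpath)
if found:
    js_click(driver, found[-3])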

How can I get data fast in Python?

Hi everyone,
I am working on a Python project that uses Selenium to scrape data, but there is one problem: I have to scrape the data every 5 minutes, and Selenium is very slow. A full run of the project takes at least 30 minutes, so I can't get the data every 5 minutes.
If you have experience in this field, please help me. If you can suggest other approaches (for example Beautiful Soup), I will be very happy.
Note: the site I want to get data from is rendered using JavaScript.
This is my source code; I am testing it.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

driver = webdriver.Chrome()
driver.set_window_size(800, 600)

tickerNames = []
tickerPrice = []
finvizUrl = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
nasdaqUrl = "https://www.nasdaq.com/market-activity/stocks/"

def openPage(url):
    driver.get(url)

def exitDrive():
    driver.quit()

def getTickers():
    tickers = driver.find_elements_by_class_name('screener-link-primary')
    for i in range(len(tickers)):
        tickerNames.append(tickers[i].text)
    return tickerNames

def comparePrice(tickers):
    for i in range(len(tickers)):
        openPage(nasdaqUrl+tickers[i])
        # append instead of indexing into the empty list
        tickerPrice.append(driver.find_element_by_class_name('symbol-page-header__pricing-price').text)
    return tickerPrice

openPage(finvizUrl)
print(comparePrice(getTickers()))
There seems to be an API on the Nasdaq site that you can query (found using the browser's network tools), so there isn't really any need to use Selenium for this. Here is an example that gets the data using requests:
import requests
import lxml.html
import time

FINVIZ_URL = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
NASDAQ_URL = "https://api.nasdaq.com/api/quote/{}/summary?assetclass=stocks"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
}
session = requests.Session()
session.headers.update(headers)

r = session.get(FINVIZ_URL)
# Get data using lxml xpath but use whatever you want
x = lxml.html.fromstring(r.text)
stocks = x.xpath("//*[@class='screener-link-primary']/text()")

for stock in stocks:
    data = session.get(NASDAQ_URL.format(stock))
    print(f"INFO for {stock}")
    print(data.json())  # This might have the data you want
    # Sleep in case there is a rate limit (may not be needed)
    time.sleep(5)
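To meet the every-5-minutes requirement, the requests-based scrape can then be wrapped in a simple timer loop. A sketch assuming the logic above is factored into a scrape_once() function (the name is a placeholder):
import time

SCRAPE_INTERVAL = 5 * 60  # seconds

def scrape_once():
    # put the finviz + Nasdaq API logic from above in here
    ...

while True:
    started = time.time()
    scrape_once()
    # Sleep for whatever is left of the 5-minute window.
    time.sleep(max(0, SCRAPE_INTERVAL - (time.time() - started)))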

I can't download google images using python selenium

Hi, I'm crawling Google Images using Selenium, but it isn't working well. How can I get this code to work? My code is below.
Previously I used google_images_download and suddenly got stuck, so I'm looking for a new way. I hope someone can help. Thank you.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import os
import urllib.request as urllib2
import argparse

searchterm = 'spider'  # will also be the name of the folder
url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch"
# NEED TO DOWNLOAD CHROMEDRIVER, insert path to chromedriver inside parentheses in following line
browser = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')
browser.get(url)
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)

for _ in range(500):
    browser.execute_script("window.scrollBy(0,10000)")

for x in browser.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'):
    counter = counter + 1
    print("Total Count:", counter)
    print("Succsessful Count:", succounter)
    print("URL:", json.loads(x.get_attribute('innerHTML'))["ou"])
    img = json.loads(x.get_attribute('innerHTML'))["ou"]
    imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
    try:
        # header is already a dict, so pass it directly
        req = urllib2.Request(img, headers=header)
        raw_img = urllib2.urlopen(req).read()
        with open(os.path.join(searchterm, searchterm + "_" + str(counter) + "." + imgtype), "wb") as File:
            File.write(raw_img)
        succounter = succounter + 1
    except:
        print("can't get img")

print(succounter, "pictures succesfully downloaded")
browser.close()
I decided to crawl images from websites other than Google Images.
I also had problems crawling images from Google with methods like yours that rely on rg_meta:
the source code of the Google image search results page changed at the beginning of 2020, and it no longer contains rg_meta.
The rg_meta tag was also changed to a randomized string like
rg_X XXXXXX XXXXXX
I think Google set out to ban crawling bots and is pushing people toward the Google Custom Search API.
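If you do go the official route, the Custom Search JSON API can return image results over plain HTTPS. A hedged sketch: YOUR_API_KEY and YOUR_CSE_ID are placeholders you have to create in the Google Cloud console and the Programmable Search Engine dashboard, and the free tier is limited to a small daily quota:
import requests

API_KEY = "YOUR_API_KEY"  # placeholder: create one in the Google Cloud console
CSE_ID = "YOUR_CSE_ID"    # placeholder: a Programmable Search Engine ID

resp = requests.get(
    "https://www.googleapis.com/customsearch/v1",
    params={
        "key": API_KEY,
        "cx": CSE_ID,
        "q": "spider",
        "searchType": "image",  # restrict results to images
        "num": 10,              # max results per request
    },
)
resp.raise_for_status()
for item in resp.json().get("items", []):
    print(item["link"])  # direct URL of the full-size image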

python search with image google images

I'm having a very tough time searching Google image search with Python. I need to do it using only standard Python libraries (so urllib, urllib2, json, ...).
Can somebody please help? Assume the image is jpeg.jpg and is in the same folder I'm running Python from.
I've tried a hundred different code versions, using headers, user-agent, base64 encoding, different URLs (images.google.com, http://images.google.com/searchbyimage?hl=en&biw=1060&bih=766&gbv=2&site=search&image_url={{URL To your image}}&sa=X&ei=H6RaTtb5JcTeiALlmPi2CQ&ved=0CDsQ9Q8, etc.).
Nothing works; it's always an error: 404, 401 or broken pipe :(
Please show me some Python script that will actually search Google Images with my own image as the search data ('jpeg.jpg' stored on my computer/device).
Thank you to whomever can solve this,
Dave :)
I use the following code in Python to search for Google images and download the images to my computer:
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson

# Define search term
searchTerm = "hello world"

# Replace spaces ' ' in search term with '%20' in order to comply with the request
searchTerm = searchTerm.replace(' ','%20')

# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()

# Set count to 0
count = 0

for i in range(0,10):
    # Notice that the start changes for each iteration in order to request a new set of images for each loop
    url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0&q='+searchTerm+'&start='+str(i*4)+'&userip=MyIP')
    print url
    request = urllib2.Request(url, None, {'Referer': 'testing'})
    response = urllib2.urlopen(request)

    # Get results using JSON
    results = simplejson.load(response)
    data = results['responseData']
    dataInfo = data['results']

    # Iterate over each result and get the unescaped url
    for myUrl in dataInfo:
        count = count + 1
        print myUrl['unescapedUrl']
        myopener.retrieve(myUrl['unescapedUrl'],str(count)+'.jpg')

    # Sleep for one second to prevent IP blocking from Google
    time.sleep(1)
You can also find very useful information here.
The Google Image Search API is deprecated; instead we use Google search to download the images using regex and Beautiful Soup:
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os

def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)))

image_type = "Action"
# you can change the query for the image here
query = "Terminator 3 Movie"
query = query.split()
query = '+'.join(query)
url = "https://www.google.co.in/search?es_sm=122&source=lnms&tbm=isch&sa=X&ei=4r_cVID3NYayoQTb4ICQBA&ved=0CAgQ_AUoAQ&biw=1242&bih=619&q=" + query
print url
header = {'User-Agent': 'Mozilla/5.0'}
soup = get_soup(url, header)

images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})]
#print images

for img in images:
    raw_img = urllib2.urlopen(img).read()
    # add the directory for your image here
    DIR = "C:\\Users\\hp\\Pictures\\valentines\\"
    cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
    print cntr
    f = open(DIR + image_type + "_" + str(cntr) + ".jpg", 'wb')
    f.write(raw_img)
    f.close()
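Since that answer is Python 2 (urllib2, print statements), here is a rough Python 3 sketch of the same thumbnail-scraping idea using requests and BeautifulSoup. The caveat is that Google's markup changes and only the small gstatic.com thumbnails can be pulled this way:
import os
import re
import requests
from bs4 import BeautifulSoup

query = "Terminator 3 Movie"
url = "https://www.google.co.in/search?tbm=isch&q=" + "+".join(query.split())
header = {"User-Agent": "Mozilla/5.0"}

soup = BeautifulSoup(requests.get(url, headers=header).text, "html.parser")
# Same idea as above: grab the thumbnail <img> tags served from gstatic.com.
images = [img["src"] for img in soup.find_all("img", {"src": re.compile("gstatic.com")})]

save_dir = "thumbnails"  # hypothetical output folder
os.makedirs(save_dir, exist_ok=True)
for i, src in enumerate(images, start=1):
    raw = requests.get(src).content
    with open(os.path.join(save_dir, "image_{}.jpg".format(i)), "wb") as f:
        f.write(raw)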
