The script should download images from Instagram, but it seems to fail to locate the URL in the src attribute of the image, even with the class name defined. This code used to work last year; all I changed was the class name and the driver.find_elements() call, since its API changed in Selenium. I am getting an error:
requests.exceptions.MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
If I add print(img.get_attribute("src")) I get None.
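For what it's worth, the 'None' in that error is just requests.get() receiving the None value that get_attribute("src") returned, so guarding the download avoids the crash even though it doesn't explain why the attribute is empty. A minimal sketch of such a guard, reusing the names from the full code below (download_if_present is a hypothetical helper):

import requests

def download_if_present(img, counter):
    # `img` is a Selenium WebElement and `counter` the running file index,
    # both taken from the loop in the full code below.
    img_url = img.get_attribute("src")
    if not img_url:  # src not populated yet (or wrong element matched)
        print("[-] skipping element with empty src")
        return
    r = requests.get(img_url)
    with open("img_{}.jpg".format(counter), "wb") as f:
        f.write(r.content)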
Full code:
import requests
import selenium.webdriver as webdriver
import time
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats')
driver.get(url)
scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0
print('[+] Downloading:\n')
def screens(get_name):
    with open("img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements(By.CLASS_NAME, "_aagv")
    imgs_dedupe = set(imgs) - set(old_imgs)
    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print(img.get_attribute("src"))
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs
    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
Any ideas why this would behave like that?
Related
I'm working on a project and I'm trying to extract the pictures' URLs from a website.
I have tried multiple solutions with the code below, but none of them helped me out.
I want the image of each product separately.
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
driver = webdriver.Chrome('D:/chromedrive/chromedriver.exe')
wshoes = []
url= "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)
time.sleep(7)
# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    conte = soup.find_all('div', class_='product-card__body')

images = conte.find('img', {'class': 'product-card__hero-image css-1fxh5tw'}, src=True)
print(images)
You can try using selenium:
images = driver.find_elements_by_tag_name('img')
for image in images:
    imageClass = image.get_attribute('class')
    if imageClass == 'product-card__hero-image css-1fxh5tw':
        print(image)
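As a side note, the same filtering can be done in one call with a CSS selector, which also makes it easy to print the URL instead of the element object. A minimal sketch, assuming the class names from the question and the newer find_elements(By...) API:

from selenium.webdriver.common.by import By

# Select the hero images directly and read their src attribute.
hero_images = driver.find_elements(
    By.CSS_SELECTOR, 'img.product-card__hero-image.css-1fxh5tw')
for image in hero_images:
    print(image.get_attribute('src'))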
I want to scrape all the comments of a YouTube video using Selenium, but I am only able to scrape the first 20. I don't get what's wrong with the following code:
imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []
while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)

    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(
            By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")

    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")

    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height
printing the result
print(final_comment_list)
print(len(author_list))
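One thing worth checking, offered as a guess rather than a confirmed fix: another snippet in this thread notes that document.body.scrollHeight is unreliable on YouTube because of its floating elements, so the loop above may see an unchanged height and stop after roughly the first batch of comments. A minimal sketch of the same scroll loop driven by document.documentElement.scrollHeight instead (scroll_to_end is a hypothetical helper; the comment selectors stay as in the question):

import time

def scroll_to_end(driver, pause=5, max_rounds=50):
    # Scroll until documentElement.scrollHeight stops growing,
    # mirroring the loop from the question with a different height source.
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of comments to load
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height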
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
driver = webdriver.Chrome('D:/chromedriver.exe')
url= "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)
# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        print("break")
        break
    last_height = new_height
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    conte = soup.find_all('div', class_='product-card__body')
wshoes = []
for items in conte:
    try:
        title = items.find('div', class_='product-card__title').text
    except:
        title = ''
    try:
        sub_title = items.find('div', class_='product-card__subtitle').text
    except:
        sub_title = ''
    try:
        color = items.find('div', {'class': 'product-card__product-count'}).text
    except:
        color = ''
    try:
        link = items.find('a', {'class': 'product-card__link-overlay'})['href']
    except:
        link = ''
    try:
        price = items.select_one('div[data-test="product-price"]').text.strip()
        # item.find('div',{'class':'product-price is--current-price css-s56yt7'}).text
    except:
        price = '-'
    try:
        reduce_price = items.select_one('div[data-test="product-price-reduced"]').text.strip()
        # item.find('div',class_ ='product-price-reduced').text
    except:
        reduce_price = '-'
    print(title, sub_title, color, price, reduce_price, link)
    shoes = {
        'title': title,
        'Description': sub_title,
        'QuatityColor': color,
        'Price': price,
        'Reducedprice': reduce_price,
        'Url': link
    }
    wshoes.append(shoes)

df = pd.DataFrame(wshoes)
print(df)
df.to_csv('Nike.csv')
print('Saved to csv file')
The console output I get is:
DevTools listening on ws://127.0.0.1:58524/devtools/browser/e07b59df-6056-4144-9203-2feb91b19647
[21028:20948:0301/203812.684:ERROR:device_event_log_impl.cc(211)] [20:38:12.685] USB: usb_device_handle_win.cc:1049 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[21028:20948:0301/203812.686:ERROR:device_event_log_impl.cc(211)] [20:38:12.687] USB: usb_device_handle_win.cc:1049 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[18068:9236:0301/203825.199:ERROR:ssl_client_socket_impl.cc(962)] handshake failed; returned -1, SSL error code 1, net_error -101
The problem you appear to be having is that in some cases your test of new_height == last_height succeeds on the first attempt, so the loop breaks before the variable conte is ever assigned.
To get around this, initialise it to None and don't break unless it has been assigned.
For example:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
driver = webdriver.Chrome('D:/chromedriver.exe')
url= "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)
# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    conte = soup.find_all('div', class_='product-card__body')

wshoes = []
for items in conte:
    try:
        title = items.find('div', class_='product-card__title').text
    except:
        title = ''
    try:
        sub_title = items.find('div', class_='product-card__subtitle').text
    except:
        sub_title = ''
    try:
        color = items.find('div', {'class': 'product-card__product-count'}).text
    except:
        color = ''
    try:
        link = items.find('a', {'class': 'product-card__link-overlay'})['href']
    except:
        link = ''
    try:
        price = items.select_one('div[data-test="product-price"]').text.strip()
        # item.find('div',{'class':'product-price is--current-price css-s56yt7'}).text
    except:
        price = '-'
    try:
        reduce_price = items.select_one('div[data-test="product-price-reduced"]').text.strip()
        # item.find('div',class_ ='product-price-reduced').text
    except:
        reduce_price = '-'
    print(title, sub_title, color, price, reduce_price, link)
    shoes = {
        'title': title,
        'Description': sub_title,
        'QuatityColor': color,
        'Price': price,
        'Reducedprice': reduce_price,
        'Url': link
    }
    wshoes.append(shoes)

df = pd.DataFrame(wshoes)
print(df)
df.to_csv('Nike.csv')
print('Saved to csv file')
Another possible solution would be to move the soup = and conte = lines outside the loop.
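For completeness, a minimal sketch of that alternative, keeping the rest of the script above unchanged: the loop only scrolls, and the page is parsed once after it exits, so conte is always assigned exactly once.

# Scroll first, parse once afterwards (everything else as in the script above).
while True:
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')
conte = soup.find_all('div', class_='product-card__body')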
I have this script to download images from Instagram. The only issue I am having is that as Selenium scrolls down to the bottom of the webpage, BeautifulSoup starts grabbing the same img src links on each pass of the requests loop.
Although it continues to scroll down and download pictures, once everything is done I end up with 2 or 3 duplicates. So my question is: is there a way of preventing this duplication from happening?
import requests
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver
import time

url = ('https://www.instagram.com/kitties')
driver = webdriver.Firefox()
driver.get(url)
scroll_delay = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    imgs = soup.find_all('img', class_='_2di5p')
    for img in imgs:
        img_url = img["src"]
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    if new_height == last_height:
        break
    last_height = new_height
Update:
So I placed this part of the code outside of while True and let Selenium load the whole page first, hoping bs4 would then scrape all the images. It only gets to number 30 and then stops.
soup = BeautifulSoup(driver.page_source, 'lxml')
imgs = soup.find_all('img', class_='_2di5p')
for img in imgs:
    #tn = datetime.now().strftime('%H:%M:%S')
    img_url = img["src"]
    print('=> [+] img_{}'.format(counter))
    screens(counter)
    counter = counter + 1
The reason it only loads 30 in the second version of your script is that the rest of the elements are removed from the page DOM and are no longer part of the source that BeautifulSoup sees. The solution is to keep doing what you were doing the first time, but to remove any duplicate elements before you iterate through the list and call screens(). You can do this using sets, as below, though I'm not sure this is the absolute most efficient way to do it:
import requests
import selenium.webdriver as webdriver
import time
driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats/?hl=en')
driver.get(url)
scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0
print('[+] Downloading:\n')
def screens(get_name):
    with open("test_images/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements_by_class_name('_2di5p')
    imgs_dedupe = set(imgs) - set(old_imgs)
    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs
    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
As you can see, I used a different page to test it, one with 420 images of cats. The result was 420 images, the number of posts on that account, with no duplicates among them.
I would use the os library to check whether the file already exists:
import os
def screens(get_name):
    with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
        if os.path.isfile(path/to/the/file):  # checks file exists. Gives false on directory
        # or if os.path.exists(path/to/the/file):  # checks file/directory exists
            pass
        else:
            r = requests.get(img_url)
            f.write(r.content)
*I might have messed up the ordering of if and with statements
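On that caveat: opening the file with 'wb' truncates it before the existence check runs, so the check has to come first. A minimal sketch of the reordered version (file_path is a hypothetical name; the original leaves the path as a placeholder):

import os
import requests

def screens(get_name):
    file_path = "/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name)
    if os.path.isfile(file_path):   # skip files that were already downloaded
        return
    r = requests.get(img_url)       # img_url comes from the surrounding loop
    with open(file_path, 'wb') as f:
        f.write(r.content)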
**Update**
===================================================
Ok guys, so far so good. I have code that lets me scrape images, but it stores them in a strange way. It downloads the first 40+ images, then creates another 'kittens' folder inside the previously created 'kittens' folder and starts over (downloading the same images as in the first folder). How can I change that? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os
image_tags = []
driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

sp = soup(driver.page_source, 'html.parser')

for img_tag in sp.find_all('img'):
    image_tags.append(img_tag)

if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(requests.get(url).content)
        x += 1
    except:
        pass
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem: my spider only gets the first 15 images. I know it's probably because the page loads more images as you scroll down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os
url = 'https://www.pexels.com/search/cute%20kittens/'
page = requests.get(url)
soup = bs(page.text, 'html.parser')
image_tags = soup.findAll('img')
if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
                x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser manipulation tool such as selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')

with open('kittens.txt', 'w') as f:
    for url in image_urls:
        f.write('{}\n'.format(url))
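Since the question was about saving the images rather than just their URLs, here is a short, hedged extension of the snippet above that also downloads each file, reusing the requests pattern from the question (the kitten-{}.jpg filename scheme is an assumption):

import requests

# Download every collected URL; skip anything that does not return HTTP 200.
for x, url in enumerate(image_urls):
    response = requests.get(url)
    if response.status_code == 200:
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(response.content)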