Selenium scrolling and scraping with BeautifulSoup produces duplicate results - python

I have this script to download images from Instagram. The only issue I'm having is that once Selenium starts scrolling down to the bottom of the webpage, BeautifulSoup keeps grabbing the same img src links on each pass of the requests loop.
Although it continues to scroll down and download pictures, once everything is done I end up with 2 or 3 duplicates. So my question is: is there a way to prevent this duplication from happening?
import requests
import time
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver

url = ('https://www.instagram.com/kitties')
driver = webdriver.Firefox()
driver.get(url)
scroll_delay = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    imgs = soup.find_all('img', class_='_2di5p')
    for img in imgs:
        img_url = img["src"]
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1
    if new_height == last_height:
        break
    last_height = new_height
Update:
So I placed this part of the code outside of the while True loop and let Selenium load the whole page first, hoping bs4 would then scrape all the images. It only gets to image number 30 and then stops.
soup = BeautifulSoup(driver.page_source, 'lxml')
imgs = soup.find_all('img', class_='_2di5p')
for img in imgs:
    #tn = datetime.now().strftime('%H:%M:%S')
    img_url = img["src"]
    print('=> [+] img_{}'.format(counter))
    screens(counter)
    counter = counter + 1

The reason it only loads 30 in the second version of your script is that the rest of the elements are removed from the page DOM, so they are no longer part of the source that BeautifulSoup sees. The solution is to keep doing what you were doing the first time, but to remove any duplicate elements before you iterate through the list and call screens(). You can do this using sets as below, though I'm not sure if it's the absolute most efficient way to do it:
import requests
import selenium.webdriver as webdriver
import time

driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats/?hl=en')
driver.get(url)
scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("test_images/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements_by_class_name('_2di5p')
    imgs_dedupe = set(imgs) - set(old_imgs)

    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")

    old_imgs = imgs

    if new_height == last_height:
        break

    last_height = new_height

driver.quit()
As you can see, I used a different page to test it, one with 420 images of cats. The result was 420 images, the number of posts on that account, with no duplicates among them.
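Deduplicating on the WebElement objects works here because each element keeps its identity between iterations; if the page ever re-renders an element you have already processed, it would show up as "new" again. An alternative sketch (untested against Instagram's current markup, reusing screens(), counter, scroll_delay and last_height from the script above) is to deduplicate on the src URLs themselves:

# Sketch: same scroll loop as above, but keyed on image URLs rather than WebElements.
seen_urls = set()

while True:
    for img in driver.find_elements_by_class_name('_2di5p'):
        img_url = img.get_attribute("src")
        if img_url in seen_urls:
            continue                      # already downloaded this picture
        seen_urls.add(img_url)
        print('=> [+] img_{}'.format(counter))
        screens(counter)                  # screens() reads the global img_url, as above
        counter = counter + 1

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height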

I would use the os library to check whether the file already exists:
import os

def screens(get_name):
    if os.path.isfile(path/to/the/file):  # checks file exists. Gives False on a directory
    # or: if os.path.exists(path/to/the/file):  # checks file/directory exists
        pass
    else:
        with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
            r = requests.get(img_url)
            f.write(r.content)
*Note that the if check has to come before the with statement, since open(..., 'wb') creates (or truncates) the file.
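A slightly fuller sketch of the same idea (hypothetical, using the same img_{} naming as the question) builds the target path first and only downloads when that exact file is missing:

import os
import requests

def screens(get_name):
    # Hypothetical target path; adjust the directory to taste.
    path = "/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name)
    if os.path.isfile(path):      # already downloaded earlier, skip it
        return
    r = requests.get(img_url)     # img_url is set by the scraping loop, as in the question
    with open(path, 'wb') as f:
        f.write(r.content)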

Related

Can't extract src attribute from "img" in Python

I'm working on a project and I'm trying to extract the pictures' URLs from a website.
I've tried multiple solutions with the code below, but none of them has helped me.
I want to get each product's image separately.
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver

driver = webdriver.Chrome('D:/chromedrive/chromedriver.exe')
wshoes = []

url = "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)

SCROLL_PAUSE_TIME = 1
time.sleep(4)
time.sleep(7)

# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")

conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height

time.sleep(5)
pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')

conte = soup.find_all('div', class_='product-card__body')
images = conte.find('img', {'class': 'product-card__hero-image css-1fxh5tw'}, src=True)
print(images)
You can try using selenium:
images = driver.find_elements_by_tag_name('img')
for image in images:
    imageClass = image.get_attribute('class')
    if imageClass == 'product-card__hero-image css-1fxh5tw':
        print(image)
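An equivalent sketch (assuming the class names from the question are still in the page) that lets Selenium do the filtering with a CSS selector and reads the src attribute directly:

from selenium.webdriver.common.by import By

# Both class names must match, hence the chained .class selectors.
images = driver.find_elements(By.CSS_SELECTOR, 'img.product-card__hero-image.css-1fxh5tw')
for image in images:
    print(image.get_attribute('src'))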

Python selenium no schema supplied while downloading images

The script should download images from IG, but it seems to fail to locate the src URL of the image, even with the class name defined. This code used to work last year; all I did was change the class name and update driver.find_elements(), since that changed in Selenium. I am getting an error:
requests.exceptions.MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
If I add print(img.get_attribute("src")) I get None.
Full code:
import requests
import selenium.webdriver as webdriver
import time
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
url = ('https://www.instagram.com/cats')
driver.get(url)
scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements(By.CLASS_NAME, "_aagv")
    imgs_dedupe = set(imgs) - set(old_imgs)

    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print(img.get_attribute("src"))
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")

    old_imgs = imgs

    if new_height == last_height:
        break

    last_height = new_height

driver.quit()
Any ideas why this would behave like that?
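For what it's worth, the MissingSchema exception comes from passing None straight to requests.get(). A minimal guard (a sketch, assuming the None values come from lazy-loaded placeholders whose src has not been populated yet) would simply skip those elements inside the loop:

for img in imgs_dedupe:
    img_url = img.get_attribute("src")
    if not img_url:          # src not populated (yet) - skip instead of passing None to requests
        continue
    print('=> [+] img_{}'.format(counter))
    screens(counter)
    counter = counter + 1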

How to scrape all the comments of a youtube video using selenium, python

I want to scrape all the comments of a YouTube video using Selenium, but I'm only able to scrape the first 20. I don't get what's wrong with the following code -
imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []
while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')

while True:
    print("Scroll down to bottom")

    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)

    # Wait to load the page
    time.sleep(5)

    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")

    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")

    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height
printing the result
print(final_comment_list)
print(len(author_list))
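One thing worth checking, echoing the comment in the Nike question above: document.body.scrollHeight may not grow as YouTube loads more comments, so the height comparison can break out of the loop after the first batch. A sketch of the same loop keyed on document.documentElement.scrollHeight instead (not verified against the current YouTube layout):

last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    html.send_keys(Keys.PAGE_DOWN)   # same scrolling mechanism as above
    time.sleep(5)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height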

scroll to bottom of page before scraping with selenium

There is a web page that I want to run my scraping script on. However, because the page refreshes with additional content when you scroll down, I need to be able to add a function to my script that scrolls the web page all the way to the bottom before my scraping script is run.
In an attempt to achieve this, please find my entire script below, which seems to stop at a height of 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import pandas as pd

#Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")

#Go to the page we want to scrape
driver.get('https://icodrops.com/category/ended-ico/')

#Open csv file to write in
csv_file = open('icodrops_ended_icos.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])

page_url = 'https://icodrops.com/category/ended-ico/'

# Although only one page to scrape - need to scroll to the bottom to pull all data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(15)
    #height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight

    try:
        #print the url that we are scraping
        print('Scraping this url:' + page_url)

        #Extract a list object where each element of the list is a row in the table
        rows = driver.find_elements_by_xpath('//div[@class="col-md-12 col-12 a_ico"]')

        # Extract detail in columns from each row
        for row in rows:
            #Initialize a dictionary for each row
            row_dict = {}

            #Use relative xpaths to locate desired data
            project_name = row.find_element_by_xpath('.//div[@class="ico-row"]/div[2]/h3/a').text
            interest = row.find_element_by_xpath('.//div[@class="interest"]').text
            category = row.find_element_by_xpath('.//div[@class="categ_type"]').text
            received = row.find_element_by_xpath('.//div[@id="new_column_categ_invisted"]/span').text
            goal = row.find_element_by_xpath('.//div[@id="categ_desctop"]').text
            end_date = row.find_element_by_xpath('.//div[@class="date"]').text
            ticker = row.find_element_by_xpath('.//div[@id="t_tikcer"]').text

            # Add extracted data to the dictionary
            row_dict['project_name'] = project_name
            row_dict['interest'] = interest
            row_dict['category'] = category
            row_dict['received'] = received
            row_dict['goal'] = goal
            row_dict['end_date'] = end_date
            row_dict['ticker'] = ticker

            writer.writerow(row_dict.values())
    except Exception as e:
        print(e)
        csv_file.close()
        driver.close()
        break
Without being able to scroll to the bottom of the page, my script will only scrape data from the initial page, which constitutes only about 10% of all that is available.
I always use the piece of code below to scroll to the bottom, and I have never seen it fail.
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
So, your effective code will be
while True:
    driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
    height = driver.execute_script("return document.documentElement.scrollHeight")
    newHeight = driver.execute_script("window.scrollTo(0, " + str(height) + ");")
    time.sleep(15)
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
If you use print() to see the values in your variables, you will see that scrollTo gives None, so you can't use it to get newHeight.
Minimal working code. I tested it on http://quotes.toscrape.com/scroll, a page created for learning scraping.
from selenium import webdriver
import time

url = 'http://quotes.toscrape.com/scroll'

driver = webdriver.Firefox()
driver.get(url)

lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(1)

    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)

    if newHeight == lastHeight:
        break

    lastHeight = newHeight
BTW:
I found an answer on Stack Overflow from 2015 which uses the same method but with document.body instead of document.documentElement:
How can I scroll a web page using selenium webdriver in python?
So if this code works for you, then this question could be closed as a duplicate.

Scraping all results from page with BeautifulSoup

**Update**
===================================================
Ok guys, so far so good. I have code that allows me to scrape images, but it stores them in a strange way: it downloads the first 40+ images, then creates another 'kittens' folder within the previously created 'kittens' folder and starts over (downloading the same images as in the first folder). How can I change that? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os

image_tags = []

driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

sp = soup(driver.page_source, 'html.parser')

for img_tag in sp.find_all('img'):
    image_tags.append(img_tag)

if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(requests.get(url).content)
            x += 1
    except:
        pass
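One hedged guess about the nested kittens folders: os.chdir('kittens') changes the working directory, so if the makedirs/chdir block ever runs a second time in the same session it creates kittens/kittens. A sketch that writes via os.path.join and never changes directory avoids that:

import os
import requests

save_dir = 'kittens'
os.makedirs(save_dir, exist_ok=True)   # safe to call repeatedly, never nests

for x, image in enumerate(image_tags):
    try:
        with open(os.path.join(save_dir, 'kitten-{}.jpg'.format(x)), 'wb') as f:
            f.write(requests.get(image['src']).content)
    except (KeyError, requests.RequestException):
        pass   # tag has no src, or the download failed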
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem, because my spider only gets the first 15 images. I know it's probably because the page loads more images after scrolling down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os

url = 'https://www.pexels.com/search/cute%20kittens/'

page = requests.get(url)
soup = bs(page.text, 'html.parser')

image_tags = soup.findAll('img')

if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
                x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser manipulation tool such as selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')

os.chdir('kittens')

with open('kittens.txt', 'w') as f:
    for url in image_urls:
        f.write('{}\n'.format(url))
