**Update**
===================================================
Ok guys, so far so good. I have code that lets me scrape images, but it stores them in a strange way. It downloads the first 40+ images, then creates another 'kittens' folder inside the previously created 'kittens' folder and starts over, downloading the same images as in the first folder. How can I change this? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os

image_tags = []

driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

    sp = soup(driver.page_source, 'html.parser')

    for img_tag in sp.find_all('img'):
        image_tags.append(img_tag)

    if not os.path.exists('kittens'):
        os.makedirs('kittens')
    os.chdir('kittens')
    x = 0

    for image in image_tags:
        try:
            url = image['src']
            source = requests.get(url)
            with open('kitten-{}.jpg'.format(x), 'wb') as f:
                f.write(requests.get(url).content)
            x += 1
        except:
            pass
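For what it's worth, the nesting comes from running os.makedirs and os.chdir on every pass of the scroll loop: after the first pass the working directory is already 'kittens', so the next pass creates 'kittens/kittens' and re-downloads everything with the counter reset. A minimal sketch of one way around it, collecting the URLs only after the scrolling has finished and building paths with os.path.join instead of changing directories (this helper is illustrative, not the original code):

import os
import requests

def save_images(urls, out_dir='kittens'):
    # create the output directory once, outside any loop
    os.makedirs(out_dir, exist_ok=True)
    for i, url in enumerate(urls):
        try:
            resp = requests.get(url)
            if resp.status_code == 200:
                # write via the full path instead of os.chdir, so reruns stay in one folder
                with open(os.path.join(out_dir, 'kitten-{}.jpg'.format(i)), 'wb') as f:
                    f.write(resp.content)
        except requests.RequestException:
            pass

Called once after the scroll loop, e.g. save_images([t['src'] for t in image_tags if t.has_attr('src')]), it keeps every file in a single folder.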
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem, because my spider only gets the first 15 images. I know it's probably because the page loads more images after scrolling down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os

url = 'https://www.pexels.com/search/cute%20kittens/'
page = requests.get(url)
soup = bs(page.text, 'html.parser')
image_tags = soup.findAll('img')

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
                x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser automation tool such as Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

with open('kittens.txt', 'w') as f:  # open in write mode to save the collected URLs
    for url in image_urls:
        f.write('{}\n'.format(url))
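If you want the image files themselves rather than just a list of URLs, a short follow-up sketch that reuses the image_urls list collected above (the download loop is my addition, not part of the original answer; it writes into the 'kittens' directory the script has already changed into):

import requests

for i, url in enumerate(image_urls):
    try:
        resp = requests.get(url)
        if resp.status_code == 200:
            # save each image under a sequential name
            with open('kitten-{}.jpg'.format(i), 'wb') as f:
                f.write(resp.content)
    except requests.RequestException:
        pass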
**Related**
===================================================
I'm working on a project and I'm trying to extract the pictures' URLs from a website.
I tried multiple solutions with the code below, but none of them helped.
I want the separate product images.
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver

driver = webdriver.Chrome('D:/chromedrive/chromedriver.exe')

wshoes = []

url = "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)

SCROLL_PAUSE_TIME = 1
time.sleep(4)
time.sleep(7)

# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")

conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height

time.sleep(5)
pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')

conte = soup.find_all('div', class_='product-card__body')
images = conte.find('img', {'class': 'product-card__hero-image css-1fxh5tw'}, src=True)
print(images)
You can try using selenium:
images = driver.find_elements_by_tag_name('img')
for image in images:
    imageClass = image.get_attribute('class')
    if imageClass == 'product-card__hero-image css-1fxh5tw':
        print(image.get_attribute('src'))
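Note that find_elements_by_tag_name has been removed in newer Selenium releases; with the current By API you can also filter by class directly through a CSS selector. A rough equivalent (selector built from the class names in the question):

from selenium.webdriver.common.by import By

# match only the hero images by their classes instead of checking every <img>
images = driver.find_elements(By.CSS_SELECTOR, 'img.product-card__hero-image.css-1fxh5tw')
for image in images:
    print(image.get_attribute('src'))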
The script should download images from IG, but it seems to fail to locate the src URL of the image, even with the class name defined. This code used to work last year; all I did was change the class name and update driver.find_elements(), since that API changed in Selenium. I am getting an error:
requests.exceptions.MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
If I add print(img.get_attribute("src")) I get None.
Full code:
import requests
import selenium.webdriver as webdriver
import time
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()

url = ('https://www.instagram.com/cats')
driver.get(url)

scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements(By.CLASS_NAME, "_aagv")
    imgs_dedupe = set(imgs) - set(old_imgs)

    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print(img.get_attribute("src"))
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs

    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
Any ideas why this would behave like that?
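A quick way to narrow this down (a diagnostic sketch, not an answer confirmed in this thread) is to check whether the elements matched by that class are actually <img> tags and whether the URL lives in src or in srcset, by printing what the first few matches expose:

# hedged diagnostic: dump tag name and both URL attributes for the first few matches
for el in driver.find_elements(By.CLASS_NAME, "_aagv")[:5]:
    print(el.tag_name, repr(el.get_attribute("src")), repr(el.get_attribute("srcset")))

If the matched element turns out to be a wrapper rather than the <img> itself, reading src from el.find_element(By.TAG_NAME, 'img') inside it would be the next thing to try.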
I am trying to scrape Reddit data using Python. The result I get contains only a single subreddit's information, not the complete list.
What I Tried:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
url = "https://www.reddit.com/"
driver.get(url)

Communities = []

#content = driver.page_source
time.sleep(15)
driver.find_element("id", "header-search-bar").send_keys("BTC")
time.sleep(5)
driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
time.sleep(5)
community = driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
community.click()
time.sleep(10)

colist = driver.find_elements('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')

for comunity in colist:
    # getting all the Communities
    Name = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
    Members = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/p/span')
    Description = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/p')

    # Saving community info
    community_info = [Name.text, Members.text, Description.text]
    Communities.append(community_info)

driver.quit()

communitydf = pd.DataFrame(Communities)
communitydf.columns = ['Community', 'Members', 'Description']
communitydf.to_csv('community_details.csv', index=False)
time.sleep(5)
What I Want:
The above code only fetches the first record, but I want to access all the subreddits returned by the search query. I am new to Python and I think I mixed up the logic.
Any help will be appreciated.
Firstly, you do not wait for all the communities to load; for that you need to scroll the page to the end. Secondly, you are looking up the same XPath every time, which will always point to the same single element.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

def ger_communities(name: str):
    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    url = f"https://www.reddit.com/search/?q={name}&type=sr"
    driver.get(url)

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    communities = []
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for x in soup.find('div', {'data-testid': 'communities-list'}).find_all('a', {'data-testid': 'subreddit-link'}):
        communities.append({
            'Name': x.find('h6').get_text(),
            'Members': x.find('span').get_text(),
            'Description': x.find_all('p')[-1].get_text()
        })

    return communities

df = pd.DataFrame(ger_communities('BTC'))
df.to_csv('community_details.csv', index=False)
But I recommend using the Reddit API instead.
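For example, with the PRAW wrapper (assuming you have registered an app on Reddit; the credential values below are placeholders), searching for subreddits looks roughly like this:

import praw
import pandas as pd

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",          # placeholder credentials from your registered app
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="community-scraper by u/yourname",
)

rows = []
# subreddits.search yields Subreddit objects matching the query
for sub in reddit.subreddits.search("BTC", limit=50):
    rows.append({
        'Name': sub.display_name,
        'Members': sub.subscribers,
        'Description': sub.public_description,
    })

pd.DataFrame(rows).to_csv('community_details.csv', index=False)

This avoids the scrolling and XPath fragility entirely, at the cost of registering for API credentials.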
I have this script to download images from Instagram. The only issue I am having is that when Selenium starts scrolling down to the bottom of the webpage, BeautifulSoup grabs the same img src links again on each pass of the loop.
Although it will continue to scroll down and download pictures, after all that is done I end up with 2 or 3 duplicates. So my question is: is there a way to prevent this duplication?
import requests
import time
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver

url = ('https://www.instagram.com/kitties')
driver = webdriver.Firefox()
driver.get(url)

scroll_delay = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")

    soup = BeautifulSoup(driver.page_source, 'lxml')
    imgs = soup.find_all('img', class_='_2di5p')
    for img in imgs:
        img_url = img["src"]
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1

    if new_height == last_height:
        break
    last_height = new_height
Update:
So I placed this part of the code outside of the while True loop and let Selenium load the whole page first, hoping bs4 would then scrape all the images. It only works up to number 30 and then stops.
soup = BeautifulSoup(driver.page_source, 'lxml')
imgs = soup.find_all('img', class_='_2di5p')
for img in imgs:
    #tn = datetime.now().strftime('%H:%M:%S')
    img_url = img["src"]
    print('=> [+] img_{}'.format(counter))
    screens(counter)
    counter = counter + 1
The reason the second version of your script only loads 30 is that the rest of the elements are removed from the page DOM and are no longer part of the source BeautifulSoup sees. The solution is to keep doing what you were doing the first time, but to remove any duplicate elements before you iterate through the list and call screens(). You can do this using sets, as below, though I'm not sure it's the most efficient way:
import requests
import selenium.webdriver as webdriver
import time

driver = webdriver.Firefox()

url = ('https://www.instagram.com/cats/?hl=en')
driver.get(url)

scroll_delay = 3
last_height = driver.execute_script("return document.body.scrollHeight")
counter = 0

print('[+] Downloading:\n')

def screens(get_name):
    with open("test_images/img_{}.jpg".format(get_name), 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)

old_imgs = set()

while True:
    imgs = driver.find_elements_by_class_name('_2di5p')
    imgs_dedupe = set(imgs) - set(old_imgs)

    for img in imgs_dedupe:
        img_url = img.get_attribute("src")
        print('=> [+] img_{}'.format(counter))
        screens(counter)
        counter = counter + 1

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_delay)
    new_height = driver.execute_script("return document.body.scrollHeight")
    old_imgs = imgs

    if new_height == last_height:
        break
    last_height = new_height

driver.quit()
As you can see, I used a different page to test it, one with 420 images of cats. The result was 420 images, the number of posts on that account, with no duplicates among them.
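A possible refinement (my suggestion, not part of the original answer): deduplicating on the src string rather than on the WebElement objects also guards against the browser recreating an element for an image that was already saved, since two different elements can point at the same URL. The loop body would change to something like:

seen_urls = set()
for img in driver.find_elements_by_class_name('_2di5p'):
    img_url = img.get_attribute("src")
    if img_url and img_url not in seen_urls:  # skip anything already downloaded
        seen_urls.add(img_url)
        screens(counter)
        counter = counter + 1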
I would use the os library to check whether the file already exists:
import os

def screens(get_name):
    path = "/home/cha0zz/Desktop/photos/img_{}.jpg".format(get_name)
    if os.path.isfile(path):   # checks that the file exists; gives False for a directory
        # or: if os.path.exists(path): checks for a file or a directory
        return
    with open(path, 'wb') as f:
        r = requests.get(img_url)
        f.write(r.content)
(Note that the existence check has to happen before open(), because opening with 'wb' truncates the file.)
I am attempting to scrape the URLs of the products on an Old Navy webpage. However, it only gives part of the product list instead of the whole thing (for example, only 8 URLs when there are far more than 8). I was hoping someone could help identify what the problem may be.
from bs4 import BeautifulSoup
from selenium import webdriver
import html5lib
import platform
import urllib
import urllib2
import json

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

bigDiv = soup.findAll("div", class_="sp_sm spacing_small")
j = 0
for div in bigDiv:
    links = div.findAll("a")
    for i in links:
        j = j + 1
        productUrl = base_url + i["href"]
        print productUrl
This page uses JavaScript to load elements, but it only loads them when you scroll down the page.
This is called "lazy loading".
You have to scroll the page too.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)

# ---
# scrolling

lastHeight = driver.execute_script("return document.body.scrollHeight")
#print(lastHeight)

pause = 0.5
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
    #print(lastHeight)

# ---

html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

#driver.find_element_by_class_name

divs = soup.find_all("div", class_="sp_sm spacing_small")
for div in divs:
    links = div.find_all("a")
    for link in links:
        print base_url + link["href"]
Idea: https://stackoverflow.com/a/28928684/1832058