Retrieving all information from page BeautifulSoup - python

I am attempting to scrape the URLs of the products on an Old Navy webpage. However, it only returns part of the product list instead of the whole thing (for example, only 8 URLs when there are far more than 8). I was hoping someone could help identify what the problem may be.
from bs4 import BeautifulSoup
from selenium import webdriver
import html5lib
import platform
import urllib
import urllib2
import json

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

bigDiv = soup.findAll("div", class_="sp_sm spacing_small")
j = 0
for div in bigDiv:
    links = div.findAll("a")
    for i in links:
        j = j + 1
        productUrl = base_url + i["href"]
        print productUrl

This page uses JavaScript to load elements, but it loads them only when you scroll down the page. This is called "lazy loading".
You have to scroll the page in your script too.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)

# ---
# scrolling
lastHeight = driver.execute_script("return document.body.scrollHeight")
#print(lastHeight)

pause = 0.5
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
    #print(lastHeight)
# ---

html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

#driver.find_element_by_class_name
divs = soup.find_all("div", class_="sp_sm spacing_small")
for div in divs:
    links = div.find_all("a")
    for link in links:
        print base_url + link["href"]
Idea: https://stackoverflow.com/a/28928684/1832058
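Note: PhantomJS has since been deprecated and removed from newer Selenium releases, so the same scroll-until-the-height-stops-growing approach is usually run with headless Chrome today. A minimal Python 3 sketch of that variant (assuming chromedriver is available on your PATH):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
options.add_argument("--headless")  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)
driver.get("http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true")

# keep scrolling until the document height stops changing,
# i.e. all lazy-loaded products have been rendered
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, "html5lib")
for div in soup.find_all("div", class_="sp_sm spacing_small"):
    for a in div.find_all("a"):
        print("http://www.oldnavy.com" + a["href"])

driver.quit()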

Related

Reddit Community List using Python

I am trying to scrape Reddit data using Python. The result I get is only the information for a single subreddit, not the complete list.
What I Tried:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
url = "https://www.reddit.com/"
driver.get(url)

Communities = []
#content = driver.page_source
time.sleep(15)
driver.find_element("id", "header-search-bar").send_keys("BTC")
time.sleep(5)
driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
time.sleep(5)

community = driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
community.click()
time.sleep(10)

colist = driver.find_elements('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
for comunity in colist:
    # getting all the Communities
    Name = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
    Members = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/p/span')
    Description = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/p')
    # Saving community info
    community_info = [Name.text, Members.text, Description.text]
    Communities.append(community_info)

driver.quit()

communitydf = pd.DataFrame(Communities)
communitydf.columns = ['Community', 'Members', 'Description']
communitydf.to_csv('community_details.csv', index=False)
time.sleep(5)
What I Want:
The above code only fetches the first record, but I want to access all the subreddits returned by the search query. I am new to Python and I think I have mixed up the logic.
Any help will be appreciated.
Firstly, you do not wait for all the communities to load; for that you need to scroll the page to the end. Secondly, you look up the same XPath inside the loop, which will always point to the same specific element.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

def ger_communities(name: str):
    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    url = f"https://www.reddit.com/search/?q={name}&type=sr"
    driver.get(url)

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    communities = []
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for x in soup.find('div', {'data-testid': 'communities-list'}).find_all('a', {'data-testid': 'subreddit-link'}):
        communities.append({
            'Name': x.find('h6').get_text(),
            'Members': x.find('span').get_text(),
            'Description': x.find_all('p')[-1].get_text()
        })

    return communities

df = pd.DataFrame(ger_communities('BTC'))
df.to_csv('community_details.csv', index=False)
But I recommend using the Reddit API.
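For example, with the PRAW wrapper for the Reddit API this becomes a few lines and needs no browser at all. A minimal sketch, assuming you have registered a script app on Reddit to get a client ID and secret (the credential values below are placeholders):
import praw
import pandas as pd

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",          # placeholder
    client_secret="YOUR_CLIENT_SECRET",  # placeholder
    user_agent="community-search script",
)

rows = []
for subreddit in reddit.subreddits.search("BTC", limit=50):
    rows.append({
        'Community': subreddit.display_name,
        'Members': subreddit.subscribers,
        'Description': subreddit.public_description,
    })

pd.DataFrame(rows).to_csv('community_details.csv', index=False)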

Pages take 2 loadings to complete

On this website, https://toptees.store/linux-funny-cloud-computing, I am trying to scrape the "sold" span text, but the site needs to load twice before the page is complete. That's why the data is not scraped.
My Code:
import requests
from bs4 import BeautifulSoup
url = "https://toptees.store/linux-funny-cloud-computing"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'lxml')
sold = soup.find_all("span", class_='ng-binding')
print(sold)
I also tried Selenium with BeautifulSoup:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))  # ,options=options

filepath = 'urls.txt'
with open(filepath) as f:
    urls = [i.strip() for i in f.readlines()]

titles = []
for url in urls:
    driver.get(url)
    driver.maximize_window()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    sold = soup.find('span', class_="ng-binding")
    print(sold)
The output comes out like this: []. How can I scrape this link with BeautifulSoup?
This is one way to get that information you're after:
import time as t
from bs4 import BeautifulSoup as bs
import undetected_chromedriver as uc
options = uc.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument('--disable-notifications')
options.add_argument("--window-size=1280,720")
# options.add_argument('--headless')
browser = uc.Chrome(options=options)
url = 'https://toptees.store/linux-funny-cloud-computing'
browser.get(url)
t.sleep(1)
browser.get(url)
t.sleep(7)
soup = bs(browser.page_source, 'html.parser')
sold = soup.select_one('days-available[any-sold="campaign.sold"]')
title = soup.select_one('h1.campaign-name-title')
print(title.text, '|', [x for x in sold.text.split(' ') if len(x.strip()) > 0][0])
Result printed in terminal:
Linux funny Cloud Computing | 43
For undetected_chromedriver, please see https://pypi.org/project/undetected-chromedriver/ [instructions on how to set it up, etc]
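For what it's worth, getting it running is usually just a pip install plus the default constructor; a minimal sketch (the library fetches and patches a matching chromedriver by itself):
# pip install undetected-chromedriver
import undetected_chromedriver as uc

driver = uc.Chrome()  # launches a patched chromedriver automatically
driver.get('https://toptees.store/linux-funny-cloud-computing')
print(driver.title)
driver.quit()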

Scraping next pages issue with selenium

I am trying to scrape basic information from Google. The code I am using is the following. Unfortunately it does not move to the next page and I cannot figure out why. I am using Selenium with Google Chrome as the browser (not Firefox). Could you please tell me what is wrong in my code?
driver.get('https://www.google.com/advanced_search?q=google&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('q')
search.send_keys('tea')
search.submit()

soup = BeautifulSoup(driver.page_source, 'lxml')
result_div = soup.find_all('div', attrs={'class': 'g'})
titles = []

while True:
    next_page_btn = driver.find_elements_by_xpath("//a[@id='pnnext']")
    for r in result_div:
        if len(next_page_btn) < 1:
            print("no more pages left")
            break
        else:
            try:
                title = None
                title = r.find('h3')
                if isinstance(title, Tag):
                    title = title.get_text()
                    print(title)
                    if title != '':
                        titles.append(title)
            except:
                continue
    element = WebDriverWait(driver, 5).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
    driver.execute_script("return arguments[0].scrollIntoView();", element)
    element.click()
I set q in the query string to an empty string, used as_q rather than q for the search box name, and reordered your code a bit. I also put in a page limit to stop it running forever.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

driver = webdriver.Chrome()
driver.get('https://www.google.com/advanced_search?q=&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('as_q')
search.send_keys('tea')
search.submit()

titles = []
page_limit = 5
page = 0

while True:
    soup = BeautifulSoup(driver.page_source, 'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})
    for r in result_div:
        for title in r.find_all('h3'):
            title = title.get_text()
            print(title)
            titles.append(title)
    next_page_btn = driver.find_elements_by_id('pnnext')
    if len(next_page_btn) == 0 or page > page_limit:
        break
    element = WebDriverWait(driver, 5).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
    driver.execute_script("return arguments[0].scrollIntoView();", element)
    element.click()
    page = page + 1

driver.quit()

Scraping all results from page with BeautifulSoup

**Update**
===================================================
Ok guys, so far so good. I have code that allows me to scrape images, but it stores them in a strange way. It downloads the first 40+ images, then creates another 'kittens' folder within the previously created 'kittens' folder and starts over (downloading the same images as in the first folder). How can I change this? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os

image_tags = []

driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

sp = soup(driver.page_source, 'html.parser')

for img_tag in sp.find_all('img'):
    image_tags.append(img_tag)

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(requests.get(url).content)
        x += 1
    except:
        pass
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem: my spider only gets the first 15 images. I know it's probably because the page loads more images after scrolling down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os

url = 'https://www.pexels.com/search/cute%20kittens/'
page = requests.get(url)
soup = bs(page.text, 'html.parser')

image_tags = soup.findAll('img')

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser manipulation tool such as selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

with open('kittens.txt', 'w') as f:  # open in write mode
    for url in image_urls:
        f.write('{}\n'.format(url))
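If you also want to save the images themselves rather than just their URLs (as in your original snippet), a minimal follow-up sketch reusing the same requests pattern:
import requests

# image_urls comes from the scrolled page above; files land in the
# 'kittens' directory because of the os.chdir call
for x, url in enumerate(image_urls):
    response = requests.get(url)
    if response.status_code == 200:
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(response.content)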

Selenium no another page

I am scraping one page, but the problem I ran into today is that the site doesn't have a next page: it serves the previous page again without any error from which I could tell that the page was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me the last page again: https://example/page-7
How can I determine that https://example/page-7 was the last page, using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_ = "post-content"))
    for i in my_text:
        #collect some data
        pass
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but that is too slow because I have 30,000 links to collect data from. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

current_pages = []
for j in range(100):
    page_url = driver.current_url
    if(page_url not in current_pages):
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_ = "post-content"))
        for i in my_text:
            #collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break
