I tried to crawl all the video titles from this channel, but I didn't get any results, just a bunch of errors. Can someone tell me which part I got wrong?
import requests
from bs4 import BeautifulSoup
import operator

def start(url):
    word_list = []
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, "html.parser")
    for post_text in soup.findAll('a', {'class': 'yt-simple-endpoint style-scope ytd-grid-video-renderer'}):
        content = post_text.string
        words = content.lower().split()
        for each_word in words:
            print(each_word)
            word_list.append(each_word)

start('https://www.youtube.com/c/DolceVitaChannel/videos')
I fixed some issues in your code and tried again with a request header, but it still didn't work, so I switched to Selenium, and that worked. Here is the code.
from selenium import webdriver

firefox_options = webdriver.FirefoxOptions()
# firefox_options.add_argument('--headless')  # uncomment to run without a visible browser window
driver = webdriver.Firefox(executable_path='geckodriver', firefox_options=firefox_options)

def start(url):
    word_list = []
    driver.get(url)
    for post_text in driver.find_elements_by_id('video-title'):
        content = post_text.text
        words = content.lower().split()
        for each_word in words:
            print(each_word)
            word_list.append(each_word)

start('https://www.youtube.com/c/DolceVitaChannel/videos')
driver.close()
When you want to crawl a site like YouTube or Instagram, you run into JavaScript-rendered content: the page is built in the browser, so a plain HTTP request does not return the markup you actually see.
The solutions are:
1 - Open your browser's network tab and inspect the requests sent to the server, looking for a response that contains the data you want (in this case, I did not find what I was looking for).
2 - If YouTube exposes the data you want through its API, use that, ideally via a Python package that wraps the YouTube API; a rough sketch follows this list. [You can search for one, because Python has a library for almost everything short of reviving the dead :)]
3 - My favorite solution is the powerful Selenium framework. You can either open a real browser window or run it headless without opening one.
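For option 2, here is a minimal sketch using the official google-api-python-client package (this is not the approach I used below; the API key and channel id are placeholders you would fill in yourself):

from googleapiclient.discovery import build  # pip install google-api-python-client

API_KEY = 'YOUR_API_KEY'   # placeholder: create one in the Google Cloud console
CHANNEL_ID = 'UC...'       # placeholder: the channel's id

youtube = build('youtube', 'v3', developerKey=API_KEY)

# Look up the channel's "uploads" playlist, then page through its items.
channel = youtube.channels().list(part='contentDetails', id=CHANNEL_ID).execute()
uploads_id = channel['items'][0]['contentDetails']['relatedPlaylists']['uploads']

page_token = None
while True:
    resp = youtube.playlistItems().list(part='snippet', playlistId=uploads_id,
                                        maxResults=50, pageToken=page_token).execute()
    for item in resp['items']:
        print(item['snippet']['title'])
    page_token = resp.get('nextPageToken')
    if page_token is None:
        break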
You can use the following code, which combines the Selenium framework with the BeautifulSoup library:
from selenium import webdriver
from bs4 import BeautifulSoup

driver_path = r'geckodriver path'
firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument('--headless')
driver = webdriver.Firefox(executable_path=driver_path, firefox_options=firefox_options)

def start(url):
    titles_list = []
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    posts_div = soup.find_all('div', {'class': 'style-scope ytd-grid-renderer'})
    posts = posts_div[0].find_all("ytd-grid-video-renderer")
    for post in posts:
        details = post.find("div", attrs={"id": "details"})
        post_title = details.find("a").text
        titles_list.append(post_title)
    return titles_list

words = start('https://www.youtube.com/c/DolceVitaChannel/videos')
driver.close()

print("\n\n++++++++++++++++++++++++++++++++++++++++\n\n")
for word in words:
    print(word)
I am attempting to pull the title and link of each so-called raffle in the list on this website. However, when I try to scrape that data, it can't be found.
I have tried scraping all links on the page, but I think these "boxes" may be loaded via JavaScript.
The results I am getting are a few links unrelated to what I want. There should be 40+ links in this list, but the majority are not showing up. Any help would be great; I've been stuck on this for a while.
For some reason, this link and many others aren't showing up when I scrape.
My code:
import requests
from bs4 import BeautifulSoup

def raffle_page_collection():
    chrome_driver()  # the asker's own browser-setup helper; not defined in the snippet
    page = requests.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
    soup = BeautifulSoup(page.text, 'html.parser')
    product_header = soup.find('h1').text
    product_colorway = soup.find('h2').text
    product_sku_and_release_date_and_price = soup.find('h3').text
    container = soup.find(class_='main-container')
    raffles = container.find_all('a')
    raffle_list = []
    for items in raffles:
        raffle_list.append(items.get('href'))
    print(raffle_list)
You should try the Selenium automation library; it lets you scrape pages whose content is rendered dynamically (via JS or AJAX).
Try this:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
browser.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
time.sleep(3)  # give the JavaScript time to render the raffle boxes

soup = BeautifulSoup(browser.page_source, 'html.parser')
product_header = soup.find('h1').text
product_colorway = soup.find('h2').text
product_sku_and_release_date_and_price = soup.find('h3').text
container = soup.find(class_='main-container')
raffles = container.find("div", {"class": "vc_pageable-slide-wrapper vc_clearfix"})
raffle_list = []
for items in raffles.find_all("a", href=True):
    raffle_list.append(items.get('href'))

print(product_header)
print(product_colorway)
print(product_sku_and_release_date_and_price)
print(raffle_list)
Output:
Yeezy Boost 350 v2 Black
Black/ Black/ Black
FU9006 | 07/06/19 | $220
['https://www.43einhalb.com/en/adidas-yeezy-boost-350-v2-black-328672#SR', 'https://www.adidas.co.uk/yeezy#SR', 'https://www.allikestore.com/default/adidas-yeezy-boost-350-v2-static-black-black-fu9006-105308.html#SR', 'https://archive1820.com/en/forms/6/yeezy-raffle#SR', 'https://drops.blackboxstore.com/blackbox_launches_view/catalog/product/view/id/22296/s/yeezy-boost-350-v2#SR', 'https://woobox.com/4szm9v#SR', 'https://launches.endclothing.com/product/yeezy-boost-350-v2-fu9006#SR', 'https://www.instagram.com/p/ByEIHSHDSY6/', 'https://www.instagram.com/p/ByFG1G0lWf7/', 'https://releases.footshop.com/adidas-yeezy-boost-350-v2-agqn6WoBJZ9y4RSnzw9G#SR', 'https://launches.goodhoodstore.com/launches/yeezy-boost-350-v2-black-33#SR', 'https://www.hervia.com/launches/yeezy-350#SR', 'https://www.hibbett.com/adidas-yeezy-350-v2-black-mens-shoe/M0991.html#SR', 'https://reporting.jdsports.co.uk/cgi-bin/msite?yeezy_comp+a+0+0+0+0+0&utm_source=RedEye&utm_medium=Email&utm_campaign=Yeezy%20Boost%20351%20Clay&utm_content=0905%20Yeezy%20Clay#SR', 'https://www.instagram.com/p/ByDnK6uH6kE/', 'https://www.nakedcph.com/yeezy-boost-v2-350-static-black-raffle/s/635#SR', 'https://www.instagram.com/p/ByIXT8zHvYz/', 'https://launches.sevenstore.com/launch/yeezy-boost-350-v2-black-4033024#SR', 'https://shelta.eu/news/adidas-yeezy-boost-350-v2-black-fu9006x#SR', 'https://www.instagram.com/p/ByDI_6JAfty/', 'https://www.sneakersnstuff.com/en/product/38889/adidas-yeezy-350-v2#SR', 'https://www.instagram.com/p/ByHtt3HFkE0/', 'https://www.instagram.com/p/ByCaKR7Cde1/', 'https://tres-bien.com/adidas-yeezy-boost-350-v2-black-fu9006-fw19#SR', 'https://yeezysupply.com/products/yeezy-boost-350-v2-black-june-7-2019#SR']
ChromeDriver for the Chrome browser:
http://chromedriver.chromium.org/downloads
Installing the web driver for the Chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
Selenium tutorial:
https://selenium-python.readthedocs.io/
I'm practicing web scraping with Python at the moment and I ran into a problem: I wanted to scrape a website that lists anime I watched before, but when I try to scrape it (via requests or Selenium) I only get around 30 of the 110 anime names on the page.
Here is my code with selenium:
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get("https://anilist.co/user/Agusmaris/animelist/Completed")
data = BeautifulSoup(browser.page_source, 'lxml')
for title in data.find_all(class_="title"):
    print(title.getText())
When I run it, the page source only goes up to an anime called 'Golden Time', even though there are 70 or more entries left on the page.
Thanks
Edit: Code that works now thanks to 'supputuri':
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Firefox()
driver.get("https://anilist.co/user/Agusmaris/animelist/Completed")
time.sleep(3)

footer = driver.find_element_by_css_selector("div.footer")
preY = 0
print(str(footer))
while footer.rect['y'] != preY:
    preY = footer.rect['y']
    footer.location_once_scrolled_into_view  # accessing this property scrolls the footer into view
    print('loading')

html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
for title in soup.find_all(class_="title"):
    print(title.getText())

driver.close()
driver.quit()
ret = input()  # pause before the script exits
Here is the solution.
Make sure to add import time
driver.get("https://anilist.co/user/Agusmaris/animelist/Completed")
time.sleep(3)
footer =driver.find_element_by_css_selector("div.footer")
preY =0
while footer.rect['y']!=preY:
preY = footer.rect['y']
footer.location_once_scrolled_into_view
time.sleep(1)
print(str(driver.page_source))
This will iterate until all the anime is loaded and then gets the page source.
Let us know if this was helpful.
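An alternative pattern for the same problem, a minimal sketch assuming the list keeps growing as you scroll and reusing the driver from above: scroll to the bottom with execute_script until the page height stops changing.

import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # jump to the bottom so the site loads the next batch of entries
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the new content time to arrive
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:  # height stopped growing: everything is loaded
        break
    last_height = new_height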
So, this is the gist of what I get when I load the page source:
AniListwindow.al_token = 'E1lPa1kzYco5hbdwT3GAMg3OG0rj47Gy5kF0PUmH';Sorry, AniList requires Javascript.Please enable Javascript or http://outdatedbrowser.com>upgrade to a modern web browser.Sorry, AniList requires a modern browser.Please http://outdatedbrowser.com>upgrade to a newer web browser.
Since I know damn well that JavaScript is enabled and my Chrome version is fully up to date, and the URL listed takes you to a non-secure website to "download" a new version of your browser, I think this is a spam site. I'm not sure if you were aware of that when posting, so I won't flag it as such, but I wanted you and others who come across this to be aware.
I'm trying to scrape website traffic from semrush.com.
My current code, using BeautifulSoup, is:

from bs4 import BeautifulSoup
import urllib.request

req = urllib.request.Request('https://www.semrush.com/info/burton.com', headers={'User-Agent': 'Magic Browser'})
response = urllib.request.urlopen(req)
raw_data = response.read()
response.close()
soup = BeautifulSoup(raw_data, 'html.parser')
I've been trying data = soup.findAll("a", {"href": "/info/burton.com+(by+organic)"}) and data = soup.findAll("span", {"class": "sem-report-counter"}), without much luck.
I can see the numbers on the web page that I would like to get. Is there a way to pull this information out? I'm not seeing it in the HTML I pull.
I went the extra mile and set up a working example of how you can use selenium to scrape that page. Install selenium and try it out!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = 'https://www.semrush.com/info/burton.com'  # your url
options = Options()  # set up options
options.add_argument('--headless')  # add --headless mode to options
driver = webdriver.Chrome(executable_path='/opt/ChromeDriver/chromedriver',
                          chrome_options=options)
# note: executable_path will depend on where your chromedriver is located

driver.get(url)  # get response
driver.implicitly_wait(1)  # wait to load content
elements = driver.find_elements_by_xpath('//a[@href="/info/burton.com+(by+organic)"]')  # grab the stuff you wanted
for e in elements:
    print(e.get_attribute('text').strip())  # print text fields
driver.quit()  # close the driver when you're done
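One caveat on the above: implicitly_wait(1) only sets a one-second element-lookup timeout, so if the counters render slowly you may get nothing back. An explicit wait, swapped in before the find_elements call, is more reliable; a minimal sketch using the same XPath:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one matching link to appear
elements = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//a[@href="/info/burton.com+(by+organic)"]')))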
Output that I see in my terminal:
356K
6.5K
59.3K
$usd305K
Organic keywords
Organic
Top Organic Keywords
View full report
Organic Position Distribution
I was able to scrape the following website before, using driver = webdriver.PhantomJS() (for work reasons). What I was scraping were the price and the date.
https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf
This stopped working some days ago because of a disclaimer page that I have to agree to first:
https://www.cash.ch/fonds-investor-disclaimer?redirect=fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf
Once I agree, I can see the real content, but the driver apparently cannot: the print-out is [], so it must still be stuck on the disclaimer URL.
Please see the code below.
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os

driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)

# Swisscanto
driver.get("https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf")
s_swisscanto = BeautifulSoup(driver.page_source, 'lxml')
nav_sc = s_swisscanto.find_all('span', {"data-field-entry": "value"})
date_sc = s_swisscanto.find_all('span', {"data-field-entry": "datetime"})
print(nav_sc)
print(date_sc)
print("Done Swisscanto")
This should work (I think the button you want to click is 'zustimmen'):

driver = webdriver.PhantomJS()
driver.get("https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf")
accept_button = driver.find_element_by_link_text('zustimmen')
accept_button.click()
content = driver.page_source
More details here
python selenium click on button
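Putting the pieces together with your own selectors, a minimal sketch (the exact link text of the consent button is an assumption; inspect the disclaimer page to confirm it):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.PhantomJS()
driver.get("https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf")

# if we were redirected to the disclaimer, accept it first
if 'fonds-investor-disclaimer' in driver.current_url:
    driver.find_element_by_link_text('zustimmen').click()  # assumed link text

soup = BeautifulSoup(driver.page_source, 'lxml')
nav_sc = soup.find_all('span', {"data-field-entry": "value"})
date_sc = soup.find_all('span', {"data-field-entry": "datetime"})
print(nav_sc, date_sc)
driver.quit()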
I am learning Python scraping techniques, but I am stuck on the problem of scraping an Ajax page like this one.
I want to scrape all the medicine names and details that appear on the page. I have read most of the answers on Stack Overflow, but I am not getting the right data after scraping. I also tried scraping with Selenium and forging a POST request, but both failed.
So please help me with this Ajax scraping topic, especially for this page, since the Ajax is triggered by selecting an option from a dropdown.
Please also point me to some resources on scraping Ajax pages.
# using selenium
from selenium import webdriver
import bs4 as bs
import lxml
import requests

path_to_chrome = '/home/brutal/Desktop/chromedriver'
browser = webdriver.Chrome(executable_path=path_to_chrome)
url = 'https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/'
browser.get(url)
browser.find_element_by_xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option[contains(text(), "Ohio")]').click()
new_url = browser.current_url
r = requests.get(new_url)
print(r.content)
You can download ChromeDriver here: http://chromedriver.chromium.org/downloads
normalize-space is used in order to strip junk from the web text, such as stray \xa0 characters.
from time import sleep
from selenium import webdriver
from lxml.html import fromstring

data = {}
driver = webdriver.Chrome('PATH TO YOUR DRIVER/chromedriver')  # e.g. '/home/superman/www/myproject/chromedriver'
driver.get('https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/')

# Loop over states
for i in range(2, 7):
    dropdown_state = driver.find_element(by='id', value='ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList')
    # open the dropdown
    dropdown_state.click()
    # click the state
    driver.find_element_by_xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option[' + str(i) + ']').click()
    # let the page download
    sleep(3)
    # prepare the HTML
    page_content = driver.page_source
    tree = fromstring(page_content)
    state = tree.xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option[' + str(i) + ']/text()')[0]
    data[state] = []
    # Loop over the products inside the state
    for line in tree.xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_gridSearchResults"]/tbody/tr[@style]'):
        med_type = line.xpath('normalize-space(.//td[@class="medication-type"])')
        generic_name = line.xpath('normalize-space(.//td[@class="generic-name"])')
        brand_name = line.xpath('normalize-space(.//td[@class="brand-name hidden-xs"])')
        strength = line.xpath('normalize-space(.//td[@class="strength"])')
        form = line.xpath('normalize-space(.//td[@class="form"])')
        qty_30_day = line.xpath('normalize-space(.//td[@class="30-qty"])')
        price_30_day = line.xpath('normalize-space(.//td[@class="30-price"])')
        qty_90_day = line.xpath('normalize-space(.//td[@class="90-qty hidden-xs"])')
        price_90_day = line.xpath('normalize-space(.//td[@class="90-price hidden-xs"])')
        data[state].append(dict(med_type=med_type,
                                generic_name=generic_name,
                                brand_name=brand_name,
                                strength=strength,
                                form=form,
                                qty_30_day=qty_30_day,
                                price_30_day=price_30_day,
                                qty_90_day=qty_90_day,
                                price_90_day=price_90_day))

print('data:', data)
driver.quit()
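As an aside: if the state list is a native HTML select element (an assumption; check the page markup), selenium's Select helper can drive it more cleanly than clicking options via XPath. A sketch under that assumption:

from selenium.webdriver.support.ui import Select

for i in range(1, 6):  # select_by_index is 0-based, so this mirrors option[2]..option[6]
    # re-find the element on each pass, since the postback re-renders the page
    dropdown = Select(driver.find_element(by='id',
        value='ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList'))
    dropdown.select_by_index(i)
    sleep(3)  # same crude wait as above while the results grid reloads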