I am trying to scrape Google Hot Trends. I ran Chrome developer tools to capture all requests, but there seem to be no requests going in or out. So I tried to use Selenium, but I could not get the data for several reasons: the data is variable and changes constantly. Here is the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)


HeadlessBrowserHttpRequest(url)
Your code looks correct.
The only point I see you are missing is that you have to extract the text from the web elements you get here.
I'd also prefer printing the texts one by one rather than the whole list at once.
As follows:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)


HeadlessBrowserHttpRequest(url)
I managed to solve the problem with the following code:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("div", {"class": "card current done-typing"}):
            titles = [title for title in soup.find("div", class_="card current done-typing")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)


HeadlessBrowserHttpRequest(url)
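As a follow-up, the same "wait until the cards have finished typing" idea can be expressed with Selenium's explicit waits instead of polling page_source in a loop. This is only a sketch reusing the class name from the solution above; the timeout value is an assumption:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_finished_cards(driver, timeout=30):
    # Block until at least one "card current done-typing" element is in the DOM,
    # then hand the rendered page over to BeautifulSoup as before.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.card.current.done-typing"))
    )
    return BeautifulSoup(driver.page_source, "html.parser")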
I am a new joiner doing self-study on crawling. I tried to get the information from the Disneyland website:
https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j
I tried to crawl the price from the page, but it returns "None"; the result should be HK$639.
import requests
from bs4 import BeautifulSoup

url5 = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
r = requests.get(url5)
sp = BeautifulSoup(r.content, 'html.parser')
door = sp.find('div', class_='container')
price = door.find('p', class_='price')
print(price)
My understanding of BeautifulSoup: it parses the website's HTML, and I can use find/find_all to look up information by tag ('div', 'p') and by class. Please correct me if that is wrong, thank you.
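That understanding is right for static HTML; here is a minimal, self-contained illustration of find (first match) versus find_all (every match), using made-up markup rather than the Disneyland page. The catch on this particular site is explained in the answer below: the price is injected by JavaScript, so it is simply not in the HTML that requests downloads.
from bs4 import BeautifulSoup

# Made-up markup purely to illustrate find vs find_all.
html = '<div class="container"><p class="price">HK$639</p><p class="price">HK$499</p></div>'
sp = BeautifulSoup(html, 'html.parser')
print(sp.find('p', class_='price').text)                    # HK$639
print([p.text for p in sp.find_all('p', class_='price')])   # ['HK$639', 'HK$499']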
The page is loaded by JavaScript, so to pull out the desired data you can use Selenium along with bs4:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time

options = Options()
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)

webdriver_service = Service("./chromedriver")  # your chromedriver path
driver = webdriver.Chrome(service=webdriver_service, options=options)

url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
driver.get(url)
# driver.maximize_window()
time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)
Output:
HK$639
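If the fixed time.sleep(10) feels slow or fragile, an explicit wait can be dropped in instead. This is a sketch that reuses the "General Admission:" text from the selector above, with an assumed 20-second timeout:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait until the <strong> holding the price is present instead of sleeping blindly.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//p[contains(., 'General Admission:')]/strong"))
)
soup = BeautifulSoup(driver.page_source, 'lxml')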
I have to get <a class="last" aria-label="Last Page" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>
from this site: https://webtoon-tr.com/webtoon/
But when I try to scrape it with this code:
from bs4 import BeautifulSoup
import requests
url = "https://webtoon-tr.com/webtoon/"
html = requests.get(url).content
soup = BeautifulSoup(html,"html.parser")
last = soup.find_all("a",{"class":"last"})
print(last)
It just returns an empty list, and when I try to scrape all "a" tags it only returns 2, which are completely different things.
Can somebody help me with this? I really appreciate it.
Try using the requests_html library.
from bs4 import BeautifulSoup
import requests_html
url = "https://webtoon-tr.com/webtoon/"
s = requests_html.HTMLSession()
html = s.get(url)
soup = BeautifulSoup(html.content, "lxml")
last = soup.findAll("a", {"class":"last"})
print(last)
[<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>]
The website is protected by Cloudflare. requests, cloudscraper and requests_html don't work for me, only Selenium:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://webtoon-tr.com/webtoon/")
soup = BeautifulSoup(browser.page_source, 'html5lib')
browser.quit()
link = soup.select_one('a.last')
print(link)
This returns
<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>
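If what you actually need is the number of the last page rather than the tag itself, it can be parsed out of the href; a small sketch based on the output above:
# link is the <a class="last"> tag selected above; its href ends in .../page/122/
last_page = int(link['href'].rstrip('/').split('/')[-1])
print(last_page)  # 122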
So I want to scrape all the dates from Clash of Stats. There are multiple pages, and when you turn the page the URL does not change. How do I scrape all the dates on which the player has joined a new clan?
The website:
https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log
My code now:
from emoji import UNICODE_EMOJI
import requests
from bs4 import BeautifulSoup

name = "Pink Panther"  # player name, also embedded in the link below

link = f'https://www.clashofstats.com/players/{"Pink Panther"}-{str("#VL029CJ2").replace("#", "")}/history/log'
link = link.replace("#", "%2523")
link = link.replace("#", "%2540")
link = link.replace(" ", "-")
print(link)

for i in name:
    if i in UNICODE_EMOJI:
        link = link.replace(i, "")

page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
dates = soup.find_all(class_="start date")
print(dates)
You should use Selenium:
pip install selenium
and download the Chrome driver, for example.
Then the code will look something like this:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome('chromedriver.exe', options=options)

link = 'https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log'
driver.get(link)

while True:
    try:
        # Parse the page Selenium has rendered; the URL never changes between pages,
        # so requests.get(link) would keep returning the first page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        dates = soup.find_all(class_="start date")
        print(dates)

        next_page_link = driver.find_element_by_xpath('path_to_element')  # replace with the real XPath
        # next_page_link = driver.find_element_by_class_name('class_name')  # or locate by class name
        print(next_page_link)
        next_page_link.click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException):
        print("Last page reached")
        break

driver.quit()
But you need to find the XPath to the element, or locate the element via its class name:
Locate elements in Selenium
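For illustration, once you have inspected the page and found how the next-page button is exposed, the placeholder locator above can be filled in; the class name below is purely hypothetical, not the real one on clashofstats.com:
# Hypothetical class name - replace it with the one you find in the inspector.
next_page_link = driver.find_element_by_class_name('next-page-button')
next_page_link.click()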
I am trying to pull the p-tag comments within a review card, eventually looping through a vivino.com search via this link, using BeautifulSoup and Selenium. I was able to open the first link, but pulling the p text in the review boxes returns [].
url = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
driver = webdriver.Chrome('/Users/myname/Downloads/chromedriver')
driver.implicitly_wait(30)
driver.get(url)
python_button = driver.find_element_by_class_name('anchor__anchor--2QZvA')
python_button.click()
soup = BeautifulSoup(driver.page_source, 'lxml')
print(soup.find_all('p'))
table = soup.findAll('div',attrs={"class":"reviewCard__reviewContainer--1kMJM"})
print(table)
driver.quit()
Could anybody advise on the correct way to pull the comments? Since there is more than one comment per page, would I need to loop?
I also tried this with 'html.parser' instead of 'lxml'. Which is the correct one to use?
Thank you so much for your help.
Here is what you need to do:
import atexit
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_all_elements_located
from selenium.webdriver.support.wait import WebDriverWait


def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver


def find_elements(driver, locator):
    return WebDriverWait(driver, 10, 2).until(visibility_of_all_elements_located(locator))


URL = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
RESULTS = By.CSS_SELECTOR, "div[class*='vintageTitle'] > a"


def main():
    driver = start_driver()
    driver.get(URL)

    # note the results
    wines = []
    for element in find_elements(driver, RESULTS):
        link = element.get_attribute("href")
        name = element.find_element_by_css_selector("span[class*='vintageTitle__wine']").text
        wines.append((name, link))
    pprint(wines)

    # go extract details from each result's page
    for name, link in wines:
        print("getting comments for wine: ", name)
        driver.get(link)
        # you can do the rest ;)


if __name__ == '__main__':
    main()
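To finish the "rest" hinted at above, the review paragraphs on each wine page could be collected like this. The container class is taken from the question, and vivino's generated class names may change, so treat the selector as an assumption:
def print_review_comments(driver):
    # Call this inside the `for name, link in wines:` loop, after driver.get(link).
    for paragraph in driver.find_elements(By.CSS_SELECTOR, "div[class*='reviewCard__reviewContainer'] p"):
        print(paragraph.text)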
I would like to scrape this website: https://www.projets-environnement.gouv.fr/pages/home/
More precisely, I would like to collect the table in the div with id = table-wrapper.
My trouble is that I can't catch it with BeautifulSoup.
Here is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.projets-environnement.gouv.fr/pages/home/'
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
div_table = soup.findAll('div', id_='table-wrapper')
But div_table is a None object.
Is Selenium the solution?
I think you should use selenium:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

url = 'https://www.projets-environnement.gouv.fr/pages/home/'

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get(url)

html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
mytable = soup.find('div', id='table-wrapper')
and you get that table.
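Once mytable is found, the rows can be flattened into Python lists. This is a sketch that assumes the wrapper contains a regular <table> with <tr>/<td> cells after the JavaScript has run:
rows = []
for tr in mytable.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all(['th', 'td'])]
    if cells:
        rows.append(cells)
print(rows[:5])  # first few rows of the table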
The correct way to call is:
soup.find("div", {"id": "table-wrapper"})