BeautifulSoup4: Scrape Pages Whose URL Does Not Change [Python]

So I want to scrape all the dates from Clash of Stats. There are multiple pages, and when you turn the page, the URL does not change. How do I scrape all the dates on which the player has joined a new clan?
The website:
https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log
My code now:
from emoji import UNICODE_EMOJI
import requests
from bs4 import BeautifulSoup

name = "Pink Panther"  # player name (defined earlier in the original script)
link = f'https://www.clashofstats.com/players/{name}-{str("#VL029CJ2").replace("#", "")}/history/log'
link = link.replace("#", "%2523")
link = link.replace("#", "%2540")
link = link.replace(" ", "-")
print(link)

for i in name:
    if i in UNICODE_EMOJI:
        link = link.replace(i, "")

page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
dates = soup.find_all(class_="start date")
print(dates)

You should use Selenium:
pip install selenium
and download the Chrome driver, for example.
Then the code will look something like this:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome('chromedriver.exe', options=options)

link = 'https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log'
driver.get(link)

while True:
    try:
        # parse what the browser currently shows; a fresh requests.get(link)
        # would keep re-downloading the first page and ignore the clicks
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        dates = soup.find_all(class_="start date")
        print(dates)
        # locate the "next page" control - fill in one of these:
        # next_page_link = driver.find_element_by_xpath('path_to_element')
        # next_page_link = driver.find_element_by_class_name('class_name')
        print(next_page_link)
        next_page_link.click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break

driver.quit()
But you need to find the XPath of the element, or locate it via its class name (see the sketch below).
Locate elements in Selenium
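For example, a minimal sketch of locating and clicking that control with an explicit wait (the CSS selector below is only a placeholder guess - take the real class name or XPath from the page's DevTools):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# hypothetical locator - inspect the page to find the real "next page" button
NEXT_BUTTON = (By.CSS_SELECTOR, "button[aria-label='Next page']")

wait = WebDriverWait(driver, 10)
next_page_link = wait.until(EC.element_to_be_clickable(NEXT_BUTTON))
next_page_link.click()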

Related

Getting 'None' from a website using BeautifulSoup

I am a new joiner doing self-study on crawling. I tried to get the information from the Disneyland site:
https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j
I tried to crawl the price from the website, but it returns "None"; the result should be HK$639.
import requests
from bs4 import BeautifulSoup

url5 = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
r = requests.get(url5)
sp = BeautifulSoup(r.content, 'html.parser')
door = sp.find('div', class_='container')
price = door.find('p', class_='price')
print(price)
My understanding of BeautifulSoup: it parses the website's HTML, and I can use find/find_all to locate information by tag ('div', 'p') and by class. Please correct me if this is wrong, thank you.
The page is loaded by JavaScript, so to pull out the desired data you can use Selenium together with bs4:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service,options=options)
url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
driver.get(url)
#driver.maximize_window()
time.sleep(10)
soup=BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)
Output:
HK$639
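As a side note, the fixed time.sleep(10) can be swapped for an explicit wait so the script continues as soon as the price element is rendered. This is just a sketch reusing the driver and selector from the answer above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 15 s for the price element instead of sleeping a fixed 10 s
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located(
        (By.XPATH, "//p[contains(., 'General Admission:')]/strong")
    )
)
soup = BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)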

How to scrape Google Hot Trends

I am trying to scrape Google Hot Trends. I ran Chrome developer tools to capture all requests, but it seems there are no requests in or out. So I tried to use Selenium, but I could not get the data, mainly because the data is variable and changes constantly. Here is the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)

HeadlessBrowserHttpRequest(url)
Your code looks correct.
The only point I see you are missing is that you have to extract the text from the web elements you get here.
Also, I'd prefer printing the texts one by one rather than the whole array at once.
As follows:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)

HeadlessBrowserHttpRequest(url)
I managed to solve the problem with the following code:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("div", {"class": "card current done-typing"}):
            titles = [title for title in soup.find("div", class_="card current done-typing")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)

HeadlessBrowserHttpRequest(url)
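Since the cards keep re-rendering, the loop above will print the same titles over and over. A small tweak (my suggestion, not part of the original answer) is to remember which card texts have already been printed, reusing driver and BeautifulSoup from the code above:

seen = set()
while True:
    soup = BeautifulSoup(driver.page_source, "html.parser")
    card = soup.find("div", {"class": "card current done-typing"})
    if card:
        # collapse the card into one string and skip it if it was already shown
        text = card.get_text(" ", strip=True)
        if text and text not in seen:
            seen.add(text)
            print(text)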

Pulling all text (multiple p tags) with BeautifulSoup and Selenium returns []

I am trying to pull the p-tag comments within a review card, eventually looping through a search on vivino.com via this link, using BeautifulSoup and Selenium. I was able to open the first link, but pulling the p text in the review boxes returns [].
from selenium import webdriver
from bs4 import BeautifulSoup

url = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
driver = webdriver.Chrome('/Users/myname/Downloads/chromedriver')
driver.implicitly_wait(30)
driver.get(url)

python_button = driver.find_element_by_class_name('anchor__anchor--2QZvA')
python_button.click()

soup = BeautifulSoup(driver.page_source, 'lxml')
print(soup.find_all('p'))
table = soup.findAll('div', attrs={"class": "reviewCard__reviewContainer--1kMJM"})
print(table)
driver.quit()
Could anybody advise on the correct way to pull the comments? Since there are more than 1 comment per page would I need to loop?
I also tried this with 'html.parser' instead of 'lxml'. Which is the correct one to use?
Thank you so much for your help.
Here is what you need to do:
import atexit
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_all_elements_located
from selenium.webdriver.support.wait import WebDriverWait

def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver

def find_elements(driver, locator):
    return WebDriverWait(driver, 10, 2).until(visibility_of_all_elements_located(locator))

URL = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
RESULTS = By.CSS_SELECTOR, "div[class*='vintageTitle'] > a"

def main():
    driver = start_driver()
    driver.get(URL)

    # note the results
    wines = []
    for element in find_elements(driver, RESULTS):
        link = element.get_attribute("href")
        name = element.find_element_by_css_selector("span[class*='vintageTitle__wine']").text
        wines.append((name, link))
    pprint(wines)

    # go extract details from each result's page
    for name, link in wines:
        print("getting comments for wine: ", name)
        driver.get(link)
        # you can do the rest ;)

if __name__ == '__main__':
    main()
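For completeness, one way to finish the "you can do the rest" part could be to wait for the review cards and pull their p tags. The class name here comes from the question; it is auto-generated by the site and may change, so treat it as an assumption:

# inside the `for name, link in wines:` loop, after driver.get(link)
from bs4 import BeautifulSoup

REVIEWS = By.CSS_SELECTOR, "div[class*='reviewCard__reviewContainer']"
find_elements(driver, REVIEWS)  # reuse the helper above to wait until the cards are visible

soup = BeautifulSoup(driver.page_source, "lxml")
for card in soup.select("div[class*='reviewCard__reviewContainer']"):
    for p in card.find_all("p"):
        print(p.get_text(strip=True))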

Is this website scrape-able with BeautifulSoup?

I would like to scrape this website: https://www.projets-environnement.gouv.fr/pages/home/
More precisely, I would like to collect the table in the div with id = table-wrapper.
My trouble is that I can't catch it with BeautifulSoup.
Here is my code:

import requests
from bs4 import BeautifulSoup

url = 'https://www.projets-environnement.gouv.fr/pages/home/'
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
div_table = soup.findAll('div', id_='table-wrapper')

But div_table is a None object.
Is Selenium the solution?
I think you should use selenium:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
url = 'https://www.projets-environnement.gouv.fr/pages/home/'
options = Options()
options.headless = True
driver = webdriver.Firefox(firefox_options=options)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
mytable = soup.find('div', id='table-wrapper')
and you get that table.
The correct way to call is:
soup.find("div", {"id": "table-wrapper"})
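To spell out why the original attempt found nothing: unlike class_ (which needs the trailing underscore because class is a Python keyword), id is passed as-is, so both of the following should match the div. A quick illustration, not from the original answers:

# equivalent ways of matching <div id="table-wrapper"> with BeautifulSoup
soup.find("div", id="table-wrapper")
soup.find("div", {"id": "table-wrapper"})
# id_= is not special-cased, so it filters on a non-existent "id_" attribute and matches nothing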

Selenium webdriver: I want to click on the next page till the last page

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time

url = 'https://curecity.in/vendor-list.php?category=Doctor&filters_location=Jaipur&filters%5Bsubareas_global%5D=&filters_speciality='
driver = webdriver.Chrome('C:\chromedriver.exe')
driver.get(url)
driver.maximize_window()

next_page_number = 1
next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    for link in soup.find_all('div', class_='col-md-9 feature-info'):
        link1 = link.find('a')
        print(link1['href'])
    try:
        driver.find_element_by_link_text(">").click()
        next_page_number += 1
        time.sleep(1)
    except:
        print('No more pages')
        next_page = False

driver.close()
I am trying to click on the next page, but the code written above is clicking on alternate pages. I want to click from the first page till the last page.
I found two problems on this page.
1) It loads the page very slowly, so I had to sleep as long as 10 seconds before getting the data and clicking the button.
2) The > button works differently than I expect - it jumps 3 pages (even if I click it manually in the browser) - so I search for the button carrying the number of the next page and click that instead:
driver.find_element_by_xpath('//a[@data-page="{}"]'.format(next_page_number)).click()
Full code - it works even without BeautifulSoup:
from selenium import webdriver
#from bs4 import BeautifulSoup as bs
import time

url = 'https://curecity.in/vendor-list.php?category=Doctor&filters_location=Jaipur&filters%5Bsubareas_global%5D=&filters_speciality='
driver = webdriver.Chrome('C:\chromedriver.exe')
#driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

next_page_number = 1

while True:
    print('page:', next_page_number)
    time.sleep(10)

    #soup = bs(driver.page_source, 'html.parser')
    #for link in soup.find_all('div', class_='col-md-9 feature-info'):
    #    link1 = link.find('a')
    #    print(link1['href'])

    for link in driver.find_elements_by_xpath('//div[@class="col-md-2 feature-icon"]/a'):
        print(link.get_attribute('href'))

    try:
        next_page_number += 1
        driver.find_element_by_xpath('//a[@data-page="{}"]'.format(next_page_number)).click()
    except:
        print('No more pages')
        break  # exit loop

#driver.close()
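As a small refinement (my suggestion, not part of the original answer), the fixed time.sleep(10) could be replaced with an explicit wait on the listing links, so each page is read as soon as it has finished loading:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 20)
# wait until at least one vendor link is present instead of sleeping a fixed 10 seconds
wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//div[@class="col-md-2 feature-icon"]/a')
))
for link in driver.find_elements_by_xpath('//div[@class="col-md-2 feature-icon"]/a'):
    print(link.get_attribute('href'))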
