On one of the sites I am writing a parser for, I ran into the following problem:
I need to take all the data from a table, but the cells are not labelled in the HTML code and their order changes from listing to listing.
An example of the HTML and of the rendered table were attached as screenshots (omitted here).
At first I used XPath for this, but while parsing I found that some fields swap places (for example engine and registration number) or are missing entirely. So positional XPath is not suitable, because mileage data can end up in the engine column of the CSV file.
Is it possible, in Selenium or with bs4, to first search for a word and then parse the data after it?
That is, to find the word Engine in the HTML code and then take the data below it.
(The HTML fragment I need was attached as a screenshot, omitted here.)
My code:
import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth
def collect_data():
global driver
options = webdriver.ChromeOptions()
# ChromeOptions has no set_preference (that is a Firefox API), so pass the user agent as an argument
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
'Safari/537.36')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Background mode
# options.add_argument('headless')
try:
driver = webdriver.Chrome(options=options)
stealth(driver,
languages=["en-US", "en"],
vendor="Google Inc.",
platform="Win32",
webgl_vendor="Intel Inc.",
renderer="Intel Iris OpenGL Engine",
fix_hairline=True,
)
driver.get(
url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
)
time.sleep(10)
'''Collect all URLs'''
soup = BeautifulSoup(driver.page_source, 'lxml')
car_url_list = []
total_page = soup.find('span', class_='totPage').text
print('Ford Mustang')
print(f'Total pages: {total_page}')
print(f'Page 1 of {total_page} URL collected')
r = (int(total_page) + 1)
count = 1
for i in range(1, r, 1):
driver.get(
url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={i}'
)
driver.implicitly_wait(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
car_cards = soup.find_all('a', class_='tricky_link')
count += 1
print(f'Page {count} of {total_page} URL collected')
for car_ulr in car_cards:
car_ulr = car_ulr.get('href')
car_url_list.append(car_ulr)
with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
for line in car_url_list:
file.write(f'{line}\n')
count = 0
row = []
'''Collect car's data'''
with open('ford_mustang_url.txt', encoding='utf8') as f:
r = len(car_url_list)
print('Total cars: ' + str(r))
for i in range(r):
driver.get(f.readline())
driver.implicitly_wait(30)
soup = BeautifulSoup(driver.page_source, 'lxml')
count += 1
'''Car Data'''
car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace(' ', ' ').strip()
car_price = soup.find('span', class_='GAPrice').find('span').text
car_year = soup.find('div', class_='mid_border').get('data-year')
car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
car_url = soup.find('link', hreflang='en').get('href')
# car_engine
'''If section'''
if car_reg_number == 'ABC-123':
car_reg_number = None
if car_mileage == '100000000':
car_mileage = None
print(f'{count}. ' + car_name)
print('Price: ' + f'{car_price}')
print('Year: ' + f'{car_year}')
print('Mileage: ' + f'{car_mileage}')
print('Reg.Number: ' + f'{car_reg_number}')
print('URL: ' + f'{car_url}\n')
data = {
'Name': car_name,
'Price': car_price,
'Year': car_year,
'Mileage': car_mileage,
'Reg.Number': car_reg_number,
'URL': car_url,
}
row.append(data)
csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=csv_title)
writer.writeheader()
writer.writerows(row)
except Exception as ex:
print(ex)
finally:
driver.close()
driver.quit()
def main():
collect_data()
if __name__ == '__main__':
main()
I have found a partial solution with Selenium using if/else:
car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[1]').text
if car_engine == 'Engine':
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[2]').text.split(" ", 2)[0]
else:
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[1]/td[5]').text.split(" ", 2)[0]
For Drive type it doesn't work, so I did this...
drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
if drive_type == 'Drive type':
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
else:
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[4]').text
    if drive_type == 'Drive type':
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[5]').text
    else:
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[1]').text
        if drive_type == 'Drive type':
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[2]').text
        else:
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[1]').text
            if drive_type == 'Drive type':
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[2]').text
            else:
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
                if drive_type == 'Drive type':
                    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
                else:
                    pass
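The pattern behind all of these if/else chains is a label lookup: find the cell whose text is the label, then read the cell next to it. A minimal BeautifulSoup sketch of that idea (assuming the spec table uses plain td/th cells; value_after_label is a made-up helper name):

def value_after_label(soup, label):
    # find the cell whose text is exactly the label (e.g. 'Engine', 'Drive type')
    cell = soup.find(lambda tag: tag.name in ('td', 'th') and tag.get_text(strip=True) == label)
    if cell is None:
        return None  # this listing simply doesn't have the field
    next_cell = cell.find_next('td')  # the value sits in the next data cell
    return next_cell.get_text(strip=True) if next_cell else None

# usage, with soup already built from driver.page_source:
car_engine = value_after_label(soup, 'Engine')
drive_type = value_after_label(soup, 'Drive type')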
Here is a solution for your problem, not based on selenium (it's not the right tool for this job), which will produce a dataframe/csv with all the details you're after:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
scraper = cloudscraper.create_scraper()
big_df = pd.DataFrame()
urls_list = []
for x in tqdm(range(1, 8)):
r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={x}')
soup = BeautifulSoup(r.text, 'html.parser')
car_links = [x.get('href') for x in soup.select_one('div#listingData').select('a.tricky_link')]
for link in car_links:
urls_list.append(link)
for url in tqdm(urls_list):
r = scraper.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
dfs = pd.read_html(str(r.text))
df_list = []
title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
df_list.append(('make_model', title))
df_list.append(('variant', subtitle))
for i, row in dfs[0].iterrows():
df_list.append((row[0], row[1]))
df_list.append((row[3], row[4]))
correct_df = pd.DataFrame(df_list).T
new_header = correct_df.iloc[0]
correct_df = correct_df[1:]
correct_df.columns = new_header
big_df = big_df.append(correct_df)
big_df.to_csv('finnish_cars.csv')
A couple of notes: the first two cars' descriptions are in Finnish and the rest are in English, so the final df/csv will look a bit odd, but the data will be there. Also, you might get some warnings in the terminal about pd append being deprecated in favour of concat, but those are just warnings; the program will still run.
You can install cloudscraper with pip install cloudscraper, and tqdm with pip install tqdm. Of course, if you're keen on using Selenium, you can apply the same methods to the HTML obtained from Selenium.
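If you want to get rid of the append deprecation warning mentioned above, the accumulation can be done with pd.concat instead (a one-line sketch):

# instead of big_df = big_df.append(correct_df)
big_df = pd.concat([big_df, correct_df])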
Related
Can anyone help with scraping https://www.whed.net/home.php?
The code I'm using gives me an empty df. I would love to get the universities with their websites and maybe the field of study. My scraping skills are weak, so any guidance through this would be great, thanks.
import time
import pandas as pd
from selenium import webdriver as wb
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException

begin=time.time()
countries=['Emirates','United States of America (all)']
result = [] # List to store all data
univ_links=[] # Links for all universities
fields = ['Street:','City:','Province:','Post Code:','WWW:','Fields of study:','Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe') # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)
#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
grps=webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el:countries.append(c.text)
for g in grps: countries.append(g.text)
for cntry in countries:
select = Select(webD.find_element_by_id('Chp1'))#select country dropdown
select.select_by_visible_text(cntry)#choosing country
Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
Btn_GO.click()
select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))#select results per page drop down
select_rpp.select_by_visible_text('100')#choosing 100 results per page option
university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li') # list of university elements
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
while True:
try:
webD.find_element_by_partial_link_text('Next').click()
university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
except NoSuchElementException: break
for l in univ_links:
webD.get(l)
webD.implicitly_wait(2)
title=webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
cntry_name=webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
t1=webD.find_elements_by_class_name('dt')
t2=webD.find_elements_by_class_name('dd')
labels=webD.find_elements_by_class_name('libelle')
content=webD.find_elements_by_class_name('contenu')
temp={}
fos=''
fos1=''
temp.update({'Title': title,'Detailed Title':title_detailed,'Country':cntry_name})
for i in range(len(t1)):
if t1[i].text == '' or t1[i].text == 'Address':
continue
else:
value=t2[i].text
temp.update({t1[i].text:value.replace('\n',',')})
for j in range(len(content)):
if labels[j].text in fields:
if labels[j].text == 'Fields of study:':
info=content[j].text
fos=fos+','+info
elif labels[j].text == 'Job title:':
info1=content[j].text
fos1=fos1+','+info1
else:
key=labels[j].text
temp.update({key[:-1]: content[j].text})
temp.update({'Fields of study': fos.lstrip(','),'Job titles':fos1.lstrip(',')})
result.append(temp)
data=pd.DataFrame(result)
data
end=time.time()
print("Time taken : "+ str(end-begin) +"s")
data.to_csv("WHED1.csv",index=False)
This is code I was able to use, taken from a GitHub project.
It would be great if I could re-create the data and save it; I want to use it as a dropdown in a web application, just to make sure there are no mistakes in how the university someone studied at is written.
Update 1/12/22 - Async
Found a much better solution using aiohttp; it also runs through the entire list of countries in ~30 seconds instead of 3 hours.
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
print("Init")
driver = init_driver()
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Gathering Countries")
countries = get_countries(driver)
driver.quit()
print("Scraping")
start = time.time()
institution_list = asyncio.run(fetch_all(countries))
print("Writing out")
f = open('output.json', 'w')
f.write(json.dumps(institution_list))
f.close()
end = time.time()
print(f"Total time: {end - start}s")
def init_driver():
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
return driver
def get_countries(driver):
select = Select(driver.find_element(By.ID, "Chp1"))
countries = list(map(lambda c: c.get_attribute('value'), select.options))
countries.pop(0)
return countries
def extract_institutions(html, country):
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
print(str(page))
number_of_institutions = str(page).split()[0]
if number_of_institutions == 'No':
print(f"No results for {country}")
return []
results = []
inst_index = 0
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
inst_index += 1
return {
'country': country,
'count': number_of_institutions,
'records': results
}
async def get_institutions(country, session):
try:
async with session.post(
url='https://www.whed.net/results_institutions.php',
data={"Chp1": country, "nbr_ref_pge": 10000}
) as response:
html = await response.read()
print(f"Successfully got {country}")
return extract_institutions(html, country)
except Exception as e:
print(f"Unable to get {country} due to {e.__class__}.")
async def fetch_all(countries):
async with aiohttp.ClientSession() as session:
return await asyncio.gather(*[get_institutions(country, session) for country in countries])
# Main call
main()
Old answer using synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work as posted: it gets stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)
print("Searching")
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
print(f'{len(results)}/{number_of_pages}')
if counter >= int(number_of_pages):
break
counter += 10
driver.find_element(By.LINK_TEXT, "Next page").click()
time.sleep(0.5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
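If you also want the CSV file the question asks for, the collected list of dicts can be dumped with csv.DictWriter (a short sketch; the output filename is arbitrary):

import csv

# `results` is the list of {'name', 'url', 'country'} dicts collected above
with open('whed_institutions.csv', 'w', newline='', encoding='utf8') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'url', 'country'])
    writer.writeheader()
    writer.writerows(results)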
You can use Selenium to scrape the data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape other countries, either in a loop or by entering the name manually. If you need the field of study for every university, you can scrape each university's href using bs4 and then its field of study; a short sketch of pulling out those hrefs follows the code below.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
raw = soup.find_all('div', {'class': 'details'})
for i in raw:
i = (str(i.text).lstrip())
i = i.replace("\n","")
i = i.replace("\r", "")
i = i.replace("\t", "")
print(i)
next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
counter += 10
driver.quit()
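As mentioned above, the detail-page links can be pulled out of the same HTML with bs4. A minimal sketch that runs on the html captured above (the 'fancybox fancybox.iframe' anchor class and the 'https://www.whed.net/' prefix are taken from the other answers on this page, so treat them as an assumption):

links_soup = BeautifulSoup(html, 'html.parser')
detail_links = ['https://www.whed.net/' + a.get('href').strip()
                for a in links_soup.find_all('a', {'class': 'fancybox fancybox.iframe'})]
print(detail_links[:5])  # each link leads to a detail page you can parse for fields of study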
The website is giving me the same results for different scraped URLs. I guess the reason for this is that Selenium is not letting the website load completely before producing the result. I wrote my code using Beautiful Soup first, but according to the SO community, Selenium had to be used to get the final webpage to scrape. I implemented Selenium to fetch the data and Beautiful Soup to parse it, but the same problem persists.
The code is given below:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
chrome_options = Options()
chrome_options.add_argument("--headless") # Opens the browser up in background
driver = webdriver.Chrome()
def get_batsmen(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
with Chrome(options=chrome_options) as browser:
browser.get(url)
html = browser.page_source
browser.implicitly_wait(10)
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_bowler(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
# page = requests.get(url).text
with Chrome(options=chrome_options) as browser:
browser.get(url)
html = browser.page_source
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_allrounder(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
# page = requests.get(url).text
with Chrome(options=chrome_options) as browser:
browser.get(url)
html = browser.page_source
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
#Storing the data into multiple csvs
for date in date_list:
year = date.year
month = date.month
day = date.day
newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
if not os.path.exists(newpath):
os.makedirs(newpath)
newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
if not os.path.exists(newpath1):
os.makedirs(newpath1)
newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
if not os.path.exists(newpath2):
os.makedirs(newpath2)
get_batsmen(date).to_csv(newpath2+'/batsmen.csv')
get_bowler(date).to_csv(newpath2+'/bowler.csv')
get_allrounder(date).to_csv(newpath2+'/allrounder.csv')
I will be eternally grateful to anyone who could help
Using another method may help; try the following:
WebDriverWait(browser, delay)
Refer to this answer.
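For example, waiting until the rankings cells are actually present before reading page_source (a sketch reusing Chrome, chrome_options and url from the question; the CSS classes are the ones the question already selects on):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

with Chrome(options=chrome_options) as browser:
    browser.get(url)
    # block for up to 10 s until at least one name cell is in the DOM
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'td.table-body__cell.rankings-table__name.name')))
    html = browser.page_source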
Use browser.implicitly_wait(10) before defining html:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
chrome_options = Options()
chrome_options.add_argument("--headless") # Opens the browser up in background
driver = webdriver.Chrome()
def get_batsmen(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
with Chrome(options=chrome_options) as browser:
browser.get(url)
#time.sleep(15)  # it will wait for the page to load; remove the '#' if implicitly_wait does not work
browser.implicitly_wait(10)
html = browser.page_source
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
try:
player_list.append(player_name.text)
except AttributeError:
continue
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_bowler(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
# page = requests.get(url).text
with Chrome(options=chrome_options) as browser:
browser.get(url)
html = browser.page_source
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
try:
player_list.append(player_name.text)
except AttributeError:
continue
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_allrounder(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
# page = requests.get(url).text
with Chrome(options=chrome_options) as browser:
browser.get(url)
html = browser.page_source
doc = BeautifulSoup(html, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
try:
player_list.append(player_name.text)
except AttributeError:
continue
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
#Storing the data into multiple csvs
for date in date_list:
year = date.year
month = date.month
day = date.day
date = date.strftime("%Y-%m-%d")
newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
if not os.path.exists(newpath):
os.makedirs(newpath)
newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
if not os.path.exists(newpath1):
os.makedirs(newpath1)
newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
if not os.path.exists(newpath2):
os.makedirs(newpath2)
get_batsmen(date).to_csv(newpath2+'/batsmen.csv')
get_bowler(date).to_csv(newpath2+'/bowler.csv')
get_allrounder(date).to_csv(newpath2+'/allrounder.csv')
How do you scrape a web page with infinite scrolling?
My first try was using Selenium, but the site detects it as a robot:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
url = 'https://www.bloomberg.com/search?query=indonesia%20mining'
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options)
driver.get(url)
html = driver.page_source.encode('utf-8')
page_num = 0
while driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
page_num += 1
print("getting page number "+str(page_num))
time.sleep(1)
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', {"class":"text__d88756958e withThumbnail__c4ffc902a6"})
df = pd.DataFrame(columns=['judul', 'link'])
news = {}
for t in titles:
news['judul'] = t.find('a', {'class':'headline__96ba1917df'}).text.strip()
news['link'] = t.find('a', {'class':'headline__96ba1917df'}).get('href')
df = df.append(news, ignore_index=True)
Any idea how to limit the maximum page number?
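To cap the number of "load more" clicks, the page counter can be folded straight into the loop condition (a sketch using the same selector as above; MAX_PAGES is an arbitrary limit you choose):

MAX_PAGES = 20  # hypothetical cap on how many times to load more results

page_num = 0
while page_num < MAX_PAGES and driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
    driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)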
Here's the website: the Booking.com hotel page whose URL appears in the script below.
And here's my script:
from selenium import webdriver
import time
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium.webdriver.common.keys import Keys
# path to the folder where you placed your chromedriver
PATH = "driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
#driver = webdriver.Chrome(options=options, executable_path=PATH)
url = 'https://www.booking.com/hotel/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&#tab-main'
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get(url)
driver.maximize_window()
time.sleep(2)
headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
try:
cookie.click()
except:
pass
time.sleep(2)
country = driver.find_element_by_xpath('//*[@class="hp_nav_reviews_link toggle_review track_review_link_zh"]')
country.click()
time.sleep(2)
url2 = driver.current_url
commspos = []
commsneg = []
header = []
notes = []
dates = []
datestostay = []
results = requests.get(url2, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
reviews = soup.find_all('li', class_ = "review_item clearfix")
for review in reviews:
try:
commpos = review.find("p", class_ = "review_pos").text.strip()
except:
commpos = 'NA'
commspos.append(commpos)
try:
commneg = review.find("p", class_ = "review_neg").text.strip()
except:
commneg = 'NA'
commsneg.append(commneg)
head = review.find('div', class_ = 'review_item_header_content').text.strip()
header.append(head)
note = review.find('span', class_ = 'review-score-badge').text.strip()
notes.append(note)
date = review.find('p', class_ = 'review_item_date').text[23:].strip()
dates.append(date)
try:
datestay = review.find('p', class_ = 'review_staydate').text[20:].strip()
datestostay.append(datestay)
except:
datestostay.append('NaN')
data = pd.DataFrame({
'commspos' : commspos,
'commsneg' : commsneg,
'headers' : header,
'notes' : notes,
'dates' : dates,
'datestostay' : datestostay,
})
data.to_csv('dftest.csv', sep=';', index=False, encoding = 'utf_8_sig')
My script does get to the reviews sub-page (screenshot omitted), but my CSV output file is empty. I assume it has to do with some kind of JavaScript. I have run into this before: the script navigates to the sub-page but isn't really "inside" it, so it has no access to that HTML and produces nothing in the output.
That sub-page that you want loads the data from this URL.
https://www.booking.com/hotelfeaturedreviews/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=22417257c7da25395d270bcc7c6ec2e8;dest_id=-1456928;dest_type=city;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&&_=1625476042625
You can easily scrape this page and extract the data you need.
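For example, requesting that URL directly and feeding it into the same parsing loop (a sketch; the selectors are the ones already used in the question, and whether they match this endpoint exactly is an assumption):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
reviews_url = 'https://www.booking.com/hotelfeaturedreviews/fr/d-argentine.fr.html?...'  # the full URL shown above
r = requests.get(reviews_url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
reviews = soup.find_all('li', class_='review_item clearfix')
print(len(reviews))  # if this is non-zero, the parsing loop from the question will fill the CSV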
My intention is to get the full review from every profile, along with the title of the review, the user name, the user's location and the time of posting, from the Reliance Jio review pages of the website, and store them in a CSV file.
The website I want to crawl is http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061
When I tried storing the scraped data from the first two pages in a CSV file, I got the output shown below (screenshot omitted). My problem is that each line produces more columns than desired: one sentence is split across many cells.
My code :
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
wait = WebDriverWait(driver, 10)
soup=BeautifulSoup(driver.page_source,"lxml")
for items1 in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link1 = items1.find_element_by_css_selector(".reviewdata a")
link1.click()
time.sleep(2)
csv = open('index.csv','w')
column = "Name,Location,Review_date,Review_title,Review_data\n"
csv.write(column)
soup1 = BeautifulSoup(driver.page_source,"lxml")
for item1 in soup1.select(".review-article"):
name1 = item1.select("p a")[0].text
location1 = item1.select("p")[1].text
review_date1 = item1.select("small")[0].text
review_title1 = item1.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data1 = ' '.join([' '.join(items1.text.split()) for items1 in item1.select(".reviewdata")])
print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name1, location1, review_date1, review_title1, review_data1))
csv1 = open('index.csv','a')
page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
csv1.write(page1_data)
uclient=uReq(url)
page_html=uclient.read()
uclient.close()
page_soup = soup(page_html,"html.parser")
container = soup.find("ul",{"class":"pages table"})
all_li = container.findAll("li")
last_div = None
for last_div in all_li:pass
if last_div:
content = last_div.getText()
content1 = int(content)
container1 = soup.findAll("li",{"class":"next"})
li=container1[0].find("a",{"class":"btn btn-link"}).attrs['href']
driver.get(li)
wait = WebDriverWait(driver, 10)
soup=BeautifulSoup(driver.page_source,"lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = items.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
soup = BeautifulSoup(driver.page_source,"lxml")
for item in soup.select(".review-article"):
name = item.select("p a")[0].text
location = item.select("p")[1].text
review_date = item.select("small")[0].text
review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, location, review_date, review_title, review_data))
csv2 = open("index.csv",'a')
page2_data = name +","+ location+"," + review_date +","+ review_title +","+ review_data + "\n"
csv2.write(page2_data)
driver.quit()
I need help to figure out the error in my code for storing the scraped data into the CSV file in a structured manner.
View your csv file in a text editor. The problem is your spreadsheet program is parsing on BOTH commas and spaces.
Another problem is that you haven't accounted for commas WITHIN your scraped data. That is why you have the city and country in different cells. You will need to put quotation marks around values that have commas within them.
See
page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
csv1.write(page1_data)
There is already a comma used in, say, the location: Delhi, India. If you keep joining fields with commas as above, the CSV file cannot be parsed properly.
One workaround is to put double quotes around any text containing a comma, so Delhi, India becomes "Delhi, India" after this step.
def preprocess(text):
    if "," in text:
        return '"' + text + '"'
    return text
Wrap each of your text fields with the function:
page1_data = preprocess(name1) + "," + preprocess(location1) + "," + preprocess(review_date1) + "," + preprocess(review_title1) + "," + preprocess(review_data1) + "\n"
This should work.
Another option is to change the delimiter to a different character.
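For instance, with the csv module the delimiter is a single argument (a small sketch; semicolon is just an example):

import csv

with open('index.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow(['Name', 'Location', 'Review_date', 'Review_title', 'Review_data'])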
You should use the csv module because it automatically handles the problems with commas and newlines inside the text.
Create a csv writer:
f = open('index.csv','w')
csv_writer = csv.writer(f)
and write the headers as a list, not a single string:
column = ["Name", "Location", "Review_date", "Review_title", "Review_data"]
csv_writer.writerow(column)
Write each row of data the same way, as a list:
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
Full code
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
# --- init ---
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
# --- open file ---
f = open("index.csv", "w")
csv_writer = csv.writer(f)
columns = ["Name", "Location", "Review_date", "Review_title", "Review_data"]
csv_writer.writerow(columns)
# ---- get data ---
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = items.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
name = item.select("p a")[0].text
location = item.select("p")[1].text
review_date = item.select("small")[0].text
review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
print("Name:", name)
print("Location:", location)
print("Review_date:", review_date)
print("Review_Title:", review_title)
print("Review_Data:", review_data)
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
# --- get next url ---
uclient = uReq(url)
page_html = uclient.read()
uclient.close()
soup = BeautifulSoup(page_html, "html.parser")
container = soup.find("ul", {"class": "pages table"})
all_li = container.findAll("li")
if all_li:
last_div = all_li[-1]
content = last_div.getText()
content = int(content)
container = soup.findAll("li", {"class": "next"})
li = container[0].find("a", {"class": "btn btn-link"}).attrs['href']
# ---- get data ---
driver.get(li)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = items.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
name = item.select("p a")[0].text
location = item.select("p")[1].text
review_date = item.select("small")[0].text
review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
print("Name:", name)
print("Location:", location)
print("Review_date:", review_date)
print("Review_Title:", review_title)
print("Review_Data:", review_data)
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
# --- end ---
driver.quit()
f.close()
EDIT: a version without BeautifulSoup and requests, using only Selenium:
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
def get_data(driver, csv_writer):
for item in driver.find_elements_by_css_selector(".review-article"):
name = item.find_elements_by_css_selector("p a")[0].text
location = item.find_elements_by_css_selector("p")[1].text
review_date = item.find_elements_by_css_selector("small")[0].text
review_title = item.find_elements_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = item.find_elements_by_css_selector(".reviewdata")
review_data = ' '.join([' '.join(items.text.split()) for items in review_data])
print("Name:", name)
print("Location:", location)
print("Review_date:", review_date)
print("Review_Title:", review_title)
print("Review_Data:", review_data)
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
# --- init ---
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
# --- open file ---
f = open("index.csv", "w")
csv_writer = csv.writer(f)
columns = ["Name", "Location", "Review_date", "Review_title", "Review_data"]
csv_writer.writerow(columns)
# ---- get data ---
print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)
get_data(driver, csv_writer)
# --- get next url ---
url = driver.find_element_by_xpath('//li[@class="next"]/a').get_attribute("href")
# ---- get data ---
print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)
get_data(driver, csv_writer)
# --- end ---
driver.quit()
f.close()
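If you need every page rather than just the first two, the same two steps can be wrapped in a loop that stops when the "next" link disappears (a sketch of how the tail of the script could look; NoSuchElementException is what Selenium raises when the link is missing):

from selenium.common.exceptions import NoSuchElementException

while True:
    get_data(driver, csv_writer)
    try:
        url = driver.find_element_by_xpath('//li[@class="next"]/a').get_attribute("href")
    except NoSuchElementException:
        break  # no more pages
    print('url:', url)
    driver.get(url)

driver.quit()
f.close()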