On one of the sites I am writing a parser for, I ran into the following problem:
I need to take all the data from a table, but the cells are not labelled in the HTML code and they swap places from listing to listing.
[screenshot of the HTML example]
The table looks like this:
[screenshot of the table]
At first I used XPath for this, but while parsing I found that some fields swap places (for example engine and registration number) or are missing altogether. So positional XPath is not suitable, because mileage data can end up in the engine column of the CSV file.
Is it possible, in Selenium or with bs4, to first search for a word and then parse the data that follows it?
That is, to find the word 'Engine' in the HTML code and then take the data below it.
[screenshot of the HTML fragment I need]
My code:
import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth


def collect_data():
    global driver
    options = webdriver.ChromeOptions()
    # ChromeOptions has no set_preference(); override the user agent via an argument instead
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                         'Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Background mode
    # options.add_argument('headless')

    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        driver.get(url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980')
        time.sleep(10)

        '''Collect all URLs'''
        soup = BeautifulSoup(driver.page_source, 'lxml')
        car_url_list = []
        total_page = soup.find('span', class_='totPage').text
        print('Ford Mustang')
        print(f'Total pages: {total_page}')
        print(f'Page 1 of {total_page} URL collected')
        r = (int(total_page) + 1)
        count = 1
        for i in range(1, r, 1):
            driver.get(url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={i}')
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            car_cards = soup.find_all('a', class_='tricky_link')
            count += 1
            print(f'Page {count} of {total_page} URL collected')
            for car_ulr in car_cards:
                car_ulr = car_ulr.get('href')
                car_url_list.append(car_ulr)

        with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
            for line in car_url_list:
                file.write(f'{line}\n')

        count = 0
        row = []

        '''Collect car's data'''
        with open('ford_mustang_url.txt', encoding='utf8') as f:
            r = len(car_url_list)
            print('Total cars: ' + str(r))
            for i in range(r):
                driver.get(f.readline())
                driver.implicitly_wait(30)
                soup = BeautifulSoup(driver.page_source, 'lxml')
                count += 1

                '''Car Data'''
                car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace(' ', ' ').strip()
                car_price = soup.find('span', class_='GAPrice').find('span').text
                car_year = soup.find('div', class_='mid_border').get('data-year')
                car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
                car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
                car_url = soup.find('link', hreflang='en').get('href')
                # car_engine

                '''If section'''
                if car_reg_number == 'ABC-123':
                    car_reg_number = None
                if car_mileage == '100000000':
                    car_mileage = None

                print(f'{count}. ' + car_name)
                print('Price: ' + f'{car_price}')
                print('Year: ' + f'{car_year}')
                print('Mileage: ' + f'{car_mileage}')
                print('Reg.Number: ' + f'{car_reg_number}')
                print('URL: ' + f'{car_url}\n')

                data = {
                    'Name': car_name,
                    'Price': car_price,
                    'Year': car_year,
                    'Mileage': car_mileage,
                    'Reg.Number': car_reg_number,
                    'URL': car_url,
                }
                row.append(data)

        csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(row)
    except Exception as ex:
        print(ex)
    finally:
        driver.close()
        driver.quit()


def main():
    collect_data()


if __name__ == '__main__':
    main()
I have found a partial solution in Selenium using if/else:
car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[1]').text
if car_engine == 'Engine':
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[2]').text.split(" ", 2)[0]
else:
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[1]/td[5]').text.split(" ", 2)[0]
For Drive type it doesn't work, so I did this...
drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
if drive_type == 'Drive type':
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
else:
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[4]').text
    if drive_type == 'Drive type':
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[5]').text
    else:
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[1]').text
        if drive_type == 'Drive type':
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[2]').text
        else:
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[1]').text
            if drive_type == 'Drive type':
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[2]').text
            else:
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
                if drive_type == 'Drive type':
                    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
                else:
                    pass
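What I would really like is to replace that whole chain with a single lookup by label. A minimal BeautifulSoup sketch of the idea (it assumes every label such as 'Engine' or 'Drive type' sits in its own <td> with the value in the very next cell, which I have not verified for every ad):

def value_after_label(soup, label):
    # find the cell whose text is exactly the label, e.g. 'Engine'
    cell = soup.find('td', string=label)
    if cell is None:
        return None  # this ad does not list the field at all
    value_cell = cell.find_next('td')  # the data sits in the cell right after the label
    return value_cell.get_text(strip=True) if value_cell else None

# usage with the soup object already built above:
# car_engine = value_after_label(soup, 'Engine')
# drive_type = value_after_label(soup, 'Drive type')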
Here is a solution for your problem, not based on selenium (it's not the right tool for this job), which will produce a dataframe/csv with all the details you're after:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

scraper = cloudscraper.create_scraper()
big_df = pd.DataFrame()
urls_list = []

for x in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    car_links = [x.get('href') for x in soup.select_one('div#listingData').select('a.tricky_link')]
    for link in car_links:
        urls_list.append(link)

for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    dfs = pd.read_html(str(r.text))
    df_list = []
    title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
    subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
    df_list.append(('make_model', title))
    df_list.append(('variant', subtitle))
    for i, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    big_df = big_df.append(correct_df)

big_df.to_csv('finnish_cars.csv')
A couple of notes: the first 2 cars' descriptions are in Finnish and the rest are in English, so the resulting df/csv will look a bit odd, but the data will be there. Also, you might get some warnings in the terminal about pd.append being deprecated in favour of pd.concat, but those are just warnings; the program will run.
You can install cloudscraper with pip install cloudscraper, and tqdm with pip install tqdm. Of course, if you're keen on using Selenium, you can apply the same methods on html obtained from selenium.
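For example, the table part carries over almost unchanged; a minimal sketch, assuming you already have a Selenium driver sitting on one of the car detail pages:

import pandas as pd
from bs4 import BeautifulSoup

html = driver.page_source                       # rendered by Selenium instead of cloudscraper
soup = BeautifulSoup(html, 'html.parser')
dfs = pd.read_html(html)                        # same read_html call as above, just fed Selenium's html
title = soup.select_one('#heightForSlogan h1').get_text(strip=True)
print(title)
print(dfs[0])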
How do you scrape a web page with infinite scrolling?
My first try was with Selenium, but the site detects it as a robot:
from selenium import webdriver
from bs4 import BeautifulSoup  # needed for the parsing below
import time
import pandas as pd

url = 'https://www.bloomberg.com/search?query=indonesia%20mining'

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=options)
driver.get(url)
html = driver.page_source.encode('utf-8')
page_num = 0

while driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
    driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)

html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', {"class": "text__d88756958e withThumbnail__c4ffc902a6"})

df = pd.DataFrame(columns=['judul', 'link'])
news = {}
for t in titles:
    news['judul'] = t.find('a', {'class': 'headline__96ba1917df'}).text.strip()
    news['link'] = t.find('a', {'class': 'headline__96ba1917df'}).get('href')
    df = df.append(news, ignore_index=True)
Any idea how to limit the maximum page number?
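One way to cap it is to put an upper bound straight into the click loop; a minimal sketch reusing the selector from the code above (this only limits the paging, it does not address the robot detection):

max_pages = 10  # stop clicking after this many pages; pick whatever limit you need
page_num = 0
while page_num < max_pages and driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
    driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)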
I tried, but I could not print all the names, images and prices from https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

url = 'https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168'
driver = webdriver.Chrome('D:/chromedriver')
driver.get(url)
vi = driver.find_elements_by_class_name('col-6 col-sm-4 col-xl-3 mb-2 mb-md-1 mb-lg-4 px-lg-3')
for vit in vi:
    title = video.find_elements_by_xpath('//a[@class = "link c-product-tile__title"]')[0].text
    image = video.find_elements_by_xpath('tile-image c-product-tile__img')[0].text
    price = video.find_elements_by_xpath('//span[1][@class = "value"]')[0].text
    print(title, image, price)
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

url = 'https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168'
driver = webdriver.Chrome('D:/chromedriver')
driver.get(url)
pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')
content = soup.find_all('div', class_='col-6 col-sm-4 col-xl-3 mb-2 mb-md-1 mb-lg-4 px-lg-3')
skechersshoes = []
for item in content:
    patitle = item.find('div', class_='pdp-link c-product-tile__title__wrap')
    title = patitle.find('a', class_='link c-product-tile__title').text
    gender = item.find('div', class_='c-product-tile__gender').text
    gender = gender[1:-1]
    sprice = item.find('div', class_='price')
    sbprice = sprice.find('span', class_='sales')
    price = sbprice.find('span', class_='value').text
    price = price[1:-1]
    links = item.find('a', {'class': 'link c-product-tile__title'})['href']
    try:
        aexclusive = item.find('div', class_='image-container c-product-tile__image-container')
        exclusive = item.find('span', class_='c-product-tile__badge badge badge-primary').text
    except:
        exclusive = ''
    exclusive = exclusive[1:-1]
    try:
        color = item.find('div', class_='c-product-tile__color-swatches__label').text
    except:
        color = ''
    color = color[1:-1]
    try:
        promotion = item.find('div', class_='promotion').text.strip()
    except:
        promotion = ''
    promotion = promotion[1:-1]
    print(title, gender, price, links, exclusive, color, promotion)
    skechers = {
        'productname': title,
        'Gender': gender,
        'product_color': color,
        'product_price': price,
        'promotion': promotion,
        'exclusive': exclusive,
        'links': links,
    }
    skechersshoes.append(skechers)

df = pd.DataFrame(skechersshoes)
print(df.head())
df.to_csv('skechers.csv')
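To also get the image, one option is to read the <img> tag's src inside the same loop; a sketch, where the class name is only an assumption taken from the XPath in the first attempt:

# inside the `for item in content:` loop above, before skechersshoes.append(skechers)
img_tag = item.find('img', class_='tile-image')   # 'tile-image' is assumed from the first attempt, check the page source
image = img_tag.get('src') if img_tag else ''
skechers['image_url'] = image                     # adds an extra column to the CSV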
I am trying to extract the table of qualifications in scope, but I am having a hard time doing it since it's my first time. Can anyone please help me?
URL of the website I am scraping: https://training.gov.au/Organisation/Details/31102
import re
import csv
import time
from pathlib import Path
from selenium import webdriver
import bs4 as bs4
import os
import copy

option = webdriver.ChromeOptions()
option.add_argument("--incognito")
option.add_argument("headless")
exec_path = '/Users/Downloads/MUR_scraping-master/Libraries/chromedriver'
browser = webdriver.Chrome(executable_path=exec_path, options=option)

# read the url from each file into a list
course_links_file_path = Path(os.getcwd().replace('\\', '/'))
course_links_file_path = course_links_file_path.__str__() + '/links.txt'
course_links_file = open(course_links_file_path, 'r')

# the csv file we'll be saving the courses to
csv_file_path = Path(os.getcwd().replace('\\', '/'))
csv_file = csv_file_path.__str__() + '/Reading_undergraduate.csv'

for each_url in course_links_file:
    # print(each_url)
    try:
        browser.get(each_url)
    except:
        print(each_url)
        pass
    pure_url = each_url.strip()
    each_url = browser.page_source
    delay_ = 15
    soup = bs4.BeautifulSoup(each_url, 'lxml')
    desc_div = soup.find('div', class_='t-content t-state-active')
    if desc_div:
        desc_list = []
        desc_p_list = desc_div.find_all(class_='display-row')
        if desc_p_list:
            for p in desc_p_list:
                desc_list.append(p.get_text())
            desc_list = ' '.join(desc_list)
            # print(desc_list)
    table = soup.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        print(row)
Not my best code, but the following scrapes the table into a 2D array. My solution is a bit sloppy, but I hope it is something you can work with.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import bs4 as bs4

option = webdriver.ChromeOptions()
option.add_argument("--incognito")
option.add_argument("--headless")
exec_path = "TODO: Add your path"
browser = webdriver.Chrome(executable_path=exec_path, options=option)
browser.get("https://training.gov.au/Organisation/Details/31102")

# open the scope tab
browser.find_element_by_css_selector('a#detailsScopeTab').click()

# wait for the table to load
element = WebDriverWait(browser, 20).until(
    EC.presence_of_element_located(
        (By.XPATH, '//div[@id="ScopeQualification"]//div[@class="t-pagesize-wrapper"]')))

# click on the button to have all rows at once
browser.find_element_by_xpath('//div[@id="ScopeQualification"]//div[@class="t-pagesize-wrapper"]/a[last()]').click()


# wait for the table to load the new data
class element_css_class_flashed(object):
    def __init__(self, locator, css_class):
        self.locator = locator
        self.css_class = css_class
        self.was_on = False

    def __call__(self, driver):
        element = driver.find_element(*self.locator)
        if self.css_class in element.get_attribute("class"):
            self.was_on = True
        elif self.was_on:
            return element
        else:
            return False


try:
    wait = WebDriverWait(browser, 3)
    element = wait.until(element_css_class_flashed(
        (By.XPATH, '//div[@id="ScopeQualification"]//div[@class="t-status"]/a'),
        "t-loading"))
except:
    # most likely the loading was too fast to detect
    pass

soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
table = soup.select_one('div#ScopeQualification table')
all_rows = table.find_all('tr')
header_row = all_rows[0]
rows = all_rows[1:-1]

data = [[col.text for col in header_row.find_all('th')]]
for row in rows:
    data_row = []
    for col in row.find_all('td'):
        data_row.append(col.text)
    data.append(data_row)

print(data)
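If you want the table in a file rather than printed, the 2D array drops straight into csv.writer; a minimal sketch (the file name is just an example):

import csv

with open('scope_qualifications.csv', 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerows(data)  # first row is the header, the rest are the qualification rows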
This code crawls again from the beginning every time an error occurs. I want to change it so that it only crawls the new text instead of starting over, and I would also like to ask for further advice.
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd

# Setting up Chrome webdriver Options
# chrome_options = webdriver.ChromeOptions()
# setting up local path of chrome binary file
# chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
# creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")

link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ', '')
print(Ptitle)
# driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
# select_newest.select_by_visible_text('Newest')
# driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
# driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
# driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")

reviews_df = []
for i in range(1, 10):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            # print(soup.prettify())
            date = soup.find('span', class_='review-date').get_text()
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title) + 1:]
            print(soup.get_text())
            temp = pd.DataFrame({'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt}, index=[0])
            print('-' * 10)
            reviews_df.append(temp)
            # print(elem)
    except:
        print('what i can do?')
    driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
    # driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
    # driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()

reviews_df = pd.concat(reviews_df, ignore_index=True)
reviews_df.to_csv(Ptitle + 'review_google.csv', encoding='utf-8')
driver.close()
And I wonder if this is a problem with PhantomJS.
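As for not starting over after an error: one way is to remember which reviews have already been stored and skip them on the next pass. A rough sketch of the idea, reusing driver, BeautifulSoup and pd from the code above and keyed on (date, title) pairs, which is only an assumption about what makes a review unique:

seen = set()        # keys of reviews that are already in reviews_df
reviews_df = []

def collect_new_reviews():
    # grab whatever reviews are currently rendered and keep only the ones not seen before
    for elem in driver.find_elements_by_class_name('single-review'):
        soup = BeautifulSoup(elem.get_attribute('outerHTML'), 'html.parser')
        date = soup.find('span', class_='review-date').get_text()
        title = soup.find('span', class_='review-title').get_text()
        key = (date, title)
        if key in seen:
            continue    # collected on an earlier pass, skip it
        seen.add(key)
        txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title) + 1:]
        reviews_df.append(pd.DataFrame({'Date': date, 'Review Title': title, 'Review Text': txt}, index=[0]))

# call collect_new_reviews() inside the retry loop instead of rebuilding everything from scratch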