anyone can help with scraping from https://www.whed.net/home.php
the code I'm using is giving me empty df. would love to have universities with websites and maybe field of study. My scraping skills are weak so if you can guide me through this would be great thanks guys.
begin=time.time()
countries=['Emirates','United States of America (all)']
result = [] # List to store all data
univ_links=[] # Links for all universities
fields = ['Street:','City:','Province:','Post Code:','WWW:','Fields of study:','Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe') # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)
#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[#id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup')
grps=webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup/option[1]')
for c in cntry_el:countries.append(c.text)
for g in grps: countries.append(g.text)
for cntry in countries:
select = Select(webD.find_element_by_id('Chp1'))#select country dropdown
select.select_by_visible_text(cntry)#choosing country
Btn_GO = webD.find_element_by_xpath('//*[#id="fsearch"]/p/input')
Btn_GO.click()
select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))#select results per page drop down
select_rpp.select_by_visible_text('100')#choosing 100 results per page option
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li') # list of university elements
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
while True:
try:
webD.find_element_by_partial_link_text('Next').click()
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li')
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
except NoSuchElementException: break
for l in univ_links:
webD.get(l)
webD.implicitly_wait(2)
title=webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[1]').text
title_detailed = webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[2]').text
cntry_name=webD.find_element_by_xpath('//*[#id="contenu"]/p[2]').text
t1=webD.find_elements_by_class_name('dt')
t2=webD.find_elements_by_class_name('dd')
labels=webD.find_elements_by_class_name('libelle')
content=webD.find_elements_by_class_name('contenu')
temp={}
fos=''
fos1=''
temp.update({'Title': title,'Detailed Title':title_detailed,'Country':cntry_name})
for i in range(len(t1)):
if t1[i].text == '' or t1[i].text == 'Address':
continue
else:
value=t2[i].text
temp.update({t1[i].text:value.replace('\n',',')})
for j in range(len(content)):
if labels[j].text in fields:
if labels[j].text == 'Fields of study:':
info=content[j].text
fos=fos+','+info
elif labels[j].text == 'Job title:':
info1=content[j].text
fos1=fos1+','+info1
else:
key=labels[j].text
temp.update({key[:-1]: content[j].text})
temp.update({'Fields of study': fos.lstrip(','),'Job titles':fos1.lstrip(',')})
result.append(temp)
data=pd.DataFrame(result)
data
end=time.time()
print("Time taken : "+ str(end-begin) +"s")
data.to_csv("WHED1.csv",index=False)
this code what i could use taken from github project.
would be great if i can re-create the data and save it, want this to be used as a dropdown in a web application just to make sure no mistakes written in the university studied in.
Update 1/12/22 - Async
Found a much better solution using aiohttp, it also runs the entire list of countries in ~30 seconds instead of 3 hours
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
print("Init")
driver = init_driver()
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Gathering Countries")
countries = get_countries(driver)
driver.quit()
print("Scraping")
start = time.time()
institution_list = asyncio.run(fetch_all(countries))
print("Writing out")
f = open('output.json', 'w')
f.write(json.dumps(institution_list))
f.close()
end = time.time()
print(f"Total time: {end - start}s")
def init_driver():
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
return driver
def get_countries(driver):
select = Select(driver.find_element(By.ID, "Chp1"))
countries = list(map(lambda c: c.get_attribute('value'), select.options))
countries.pop(0)
return countries
def extract_institutions(html, country):
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
print(str(page))
number_of_institutions = str(page).split()[0]
if number_of_institutions == 'No':
print(f"No results for {country}")
return []
results = []
inst_index = 0
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
inst_index += 1
return {
'country': country,
'count': number_of_institutions,
'records': results
}
async def get_institutions(country, session):
try:
async with session.post(
url='https://www.whed.net/results_institutions.php',
data={"Chp1": country, "nbr_ref_pge": 10000}
) as response:
html = await response.read()
print(f"Successfully got {country}")
return extract_institutions(html, country)
except Exception as e:
print(f"Unable to get {country} due to {e.__class__}.")
async def fetch_all(countries):
async with aiohttp.ClientSession() as session:
return await asyncio.gather(*[get_institutions(country, session) for country in countries])
# Main call
main()
Old answer using synchronous algorithm
Improving on #Mithun's answer since it doesn't really work as it'll be stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)
print("Searching")
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
print(f'{len(results)}/{number_of_pages}')
if counter >= int(number_of_pages):
break
counter += 10
driver.find_element(By.LINK_TEXT, "Next page").click()
time.sleep(0.5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
You can use Selenium to scrape data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape for other countries as well using Loop or entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and its field of study.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
raw = soup.find_all('div', {'class': 'details'})
for i in raw:
i = (str(i.text).lstrip())
i = i.replace("\n","")
i = i.replace("\r", "")
i = i.replace("\t", "")
print(i)
next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
counter += 10
driver.quit()
Related
I'm trying to scrape this page
https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife
I scraped the first page this website and click with selenium to next page, but I only can get the first page content, when I scrape the second, it came the same content from first page. I dunno how to fix this or if the webpage has some protection to scraping.
Could someone help me?
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver
def scrape():
cont = [True,True,True,True,False]
for times in cont:
if times != True:
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
sleep(15)
titles = []
addresses = []
areas = []
rooms = []
bathes = []
values = []
start_time = time()
request = 0
soup = BeautifulSoup(page,'html.parser')
imov = soup.find_all('div', class_='property-card__main-content')
sleep(randint(8,15))
# Monitor
request += 1
elapsed_time = time() - start_time
print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
clear_output(wait = True)
# Throw a warning for non-200 status codes
if page.status_code != 200:
warn('Request: {}; Status code: {}'.format(requests, page.status_code))
# Break the loop if the number of requests is greater than expected
if request > 72:
warn('Number of requests was greater than expected.')
break
for container in imov:
# Título
title = container.h2.a.get_text()
t2 = title.strip()
titles.append(t2)
# Título
# Endereço
address = container.h2.span.get_text()
a2 = address.strip()
addresses.append(a2)
# Endereço
# Área
area = container.li.span.get_text()
ar2 = area.strip()
areas.append(ar2)
# Área
# Quartos
room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
r2 = room2.strip()
rooms.append(r2)
# Quartos
# Banheiros
bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
b2 = bath2.strip()
bathes.append(b2)
# Banheiros
# Valor
value = container.section.div.get_text()
v2 = value.strip()
values.append(v2)
# Valor
# Dataframe e salvar
vivareal = pd.DataFrame({
"title": titles,
"address": addresses,
"area": areas,
"rooms":rooms,
"baths":bathes,
"value":values
})
vivareal.to_csv(r'output.csv')
prox = driver.find_element_by_xpath('//*[#title="Próxima página"]')
prox.click()
else:
print('Done!')
scrape()```
Although you put the click command at the end, when it goes to the next loop, the first command is to create a new driver and then is called the command to get the main page of Viva Real to Pernambuco. This is unwanted. Instead of this you could do:
def scrape():
cont = [True,True,True,True,False]
# You create the driver and access the main page only once
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
for times in cont:
if times != True:
# Wait to load every page
sleep(15)
Your code is not working as espected, even with the fixes provided by #MarceloBaliu. Here is my code that (finally!) worked for me. I'm sharing because it can help someone, like I was helped by this website.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
class ScraperVivaReal:
wait_time = 5
def __init__(self, url):
# Initializing the webdriver
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
self.driver = webdriver.Firefox(options=options)
self.driver.maximize_window()
self.driver.get(url)
time.sleep(self.wait_time)
# Handling cookies acception
WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[#id="cookie-notifier-cta"]'))).click()
time.sleep(self.wait_time/2)
def __scrape_page__(self):
result = []
# Extracting data from the page
try:
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
except WebDriverException:
print('Webdriver was manually quit by the user!') # I configure this exception before adding the option -headless to webdriver
return result
# Finding property cards containing search results
div_list = soup.find_all('div', {'class':'property-card__content'})
# Iterating each card
for d in div_list:
# Extracting info from card
title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
# Extracting the price
try:
price = d.find('div', {'class':'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
except AttributeError:
price = "N/I"
# Splitting the address
add_list = re.split(',|-', complete_address)
add_list = [ item.strip() for item in add_list ]
if len(add_list) == 2:
city, st = add_list
neibhood = 'N/I'
address = 'N/I'
number = 'N/I'
if len(add_list) == 3:
neibhood, city, st = add_list
address = 'N/I'
number = 'N/I'
if len(add_list) == 4:
address, neibhood, city, st = add_list
number = 'N/I'
elif len(add_list) == 5:
address, number, neibhood, city, st = add_list
# Adding the result into a dicionary and appending the dict to a result list
row = { 'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neibhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price }
result.append(row)
return result
def __next_page__(self):
# Finding the "Next Page" button element
next_element = self.driver.find_element_by_xpath('//*[#title="Próxima página"]')
try:
# Trying to click it
next_element.click()
time.sleep(self.wait_time)
return True
# Treating some exceptions (element not found and element not clickable)
except ElementClickInterceptedException:
print('"Próxima Página" element is not clickable!')
except NoSuchElementException:
print('"Próxima Página" element not found!')
return False
def run(self, output):
has_next = True
final_result = []
# Getting the information!
while has_next:
results = self.__scrape_page__()
final_result.extend(results)
print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
if len(results) == 0:
break
has_next = self.__next_page__()
# Quitting Firefox
self.driver.quit()
# Exporting results to CSV
df = pd.DataFrame(final_result)
df.to_csv(output, sep=',')
S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')
I am trying to start a new thread for each page, but this way it starts a new thread after the other thread/function is finished.
Can anyone help me run them independent of each other?
Example:
Thread 1:
Open page 1
Thread 2:
Open page 2
And do this for X amount of pages.
I am a beginner in python so excuse my messy code.
import random
import string
import threading
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# driver.find_element_by_css_selector("a[onclick*='if (!window.__cfRLUnblockHandlers) return false; bail()']")
def randomStringDigits(stringLength=6):
"""Generate a random string of letters and digits """
lettersAndDigits = string.ascii_letters + string.digits
return ''.join(random.choice(lettersAndDigits) for i in range(stringLength))
def startscrape(url):
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {} # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
xx = soup.find("input",
{"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.get(page)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
xxx = soup.findAll("a", {"class": "js-lbImage"})
# find all thumbs
for link in xxx:
xxx = soup.find("a", {"href": link.get('href')})
dlfullimg = driver.find_element_by_xpath("//a[#href='" + xxx.get('href') + "']")
wait = WebDriverWait(driver, 10)
dlfullimg.click()
thumbs = soup.findAll("div", {"class": "lg-thumb-item"})
dlfullimg = driver.find_element_by_id('lg-download').click()
close = driver.find_element_by_xpath("//span[#class='lg-close lg-icon']").click()
sleep(1)
assert "No results found." not in driver.page_source
url = input("Main URL: ")
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {} # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find page number with soup.find
xx = soup.find("input",
{"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.close()
threads = []
for i in range(int(xx.get('max'))):
page = url + "page-" + str(i + 1)
t = threading.Thread(target=startscrape(url), args=[])
threads.append(t)
for t in threads:
t.start()
for t in threads:
t.join()
You can use concurrent.futures to handle the heavy lifting for you
Here's a pseudo-code to do it
import concurrent.futures
from selenium import webdriver
def process_url(url):
driver = webdriver.Chrome()
driver.get(url)
# process page
driver.close
# Find number of pages here
driver = webdriver.Chrome()
driver.get(url)
# urls = find list of urls
driver.close
threads_count = 10
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
executor.map(process_url, urls)
I am trying to scrape products details from aliexpress. I have 2 questions. First, how do I scrape category and save it in csv file in front of each product and second, how do I move to the 2nd and other pages until there are no more pages available or until page 10.
This is the code I have written to find the next pages
from bs4 import BeautifulSoup
import requests as r
page = r.get('https://www.aliexpress.com/category/200000664/jackets.html?spm=2114.11010108.102.4.650c649b8lfPOb')
soup = BeautifulSoup(page.content,'html.parser')
content = soup.find(id="pagination-bottom")
pages = content.findAll('a')
for i in pages:
print('https:' + i.get('href'))
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup import urllib.request
filename = "alibaba.csv"
f=open(filename, "w")
headers="product_name, price, Rating \n"
f.write(headers)
class alibabascrape(object):
def __init__(self, keyword):
self.keyword = keyword
self.url = f"https://www.aliexpress.com/wholesale?catId=0&initiative_id=&SearchText={keyword}"
self.driver = webdriver.Firefox(executable_path = 'c:\geckodriver.exe')
self.delay = 3
def load_alibabalist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_all_elements_located((By.ID, "form-searchbar")))
print("page is ready")
except TimeoutException:
print("Too much Time")
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name("list-item")
post_title_list = []
for post in all_posts:
title=post.text.split("\n")
name=title[0]
print(name)
price=title[2]
print(price)
rating = title[6]
print(rating)
f.write(name + "," + price + "," + rating + "\n")
post_title_list.append(post.text)
return post_title_list
def extract_category(self):
category = self.driver.find_elements_by_class_name("col-sub")
print(category)
def extract_post_urls(self):
url_list = []
html_page = urllib.request.urlopen(self.url)
soup = BeautifulSoup(html_page, "lxml")
for link in soup.findAll("a", {"class": "history-item product"}):
print(link["href"])
url_list.append(link["href"])
return url_list
keyword = "iphone"
scrapper = alibabascrape(keyword)
scrapper.load_alibabalist_url()
scrapper.extract_post_information()
scrapper.extract_category()
scrapper.extract_post_urls()
I can help you with pagination:
If you get all ref links then you can simply use for loop to iterate
all links.
If you just have prev or next page link. Then use while/ do while loop to check if the link exists and then click on it.
scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need scrape by selenium. it needs 3 steps,
1. get the company url from website.
2. get all company url from next page/ all pages.
3. scrape all contact information such as company name, website, email. etc.
the code as below, but I face two problem.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for i in range(1, MAX_PAGE_NUM):
page_num = str(i)
url ="http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
driver.get(url)
sleep(5)
sel = Selector(text=driver.page_source)
companies = sel.xpath('//*[#id="categorypagehtml"]/div[1]/div[7]/ul/li/b//#href').extract()
for i in range(0, len(companies)):
print(companies[i])
results.append(companies[i])
print('---')
for result in results:
url1 = "http://www.arabianbusinesscommunity.com" +result
print(url1)
driver.get(url1)
sleep(5)
sel = Selector(text=driver.page_source)
name = sel.css('h2::text').extract_first()
country = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
if country:
country = country.strip()
web = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/#href').extract_first()
email = sel.xpath('//a[contains(#href, "mailto:")]/#href').extract_first()
records = []
records.append((web,email,country,name))
df = pd.DataFrame(records, columns=['web','email', 'country', 'name'])
I write the code as above, but I have two problem.
1. I only can get the last company information.
2.each time it is iteration from the loop, computer always click all urls that clicked before.
can anyone help solve the problem?
Here code to get all companies details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# Iterate through all pages and get all companies URLs
for i in range(2, lastPageNum):
driver.get(baseUrl + "/" + str(i))
companyUrls.extend(driver.execute_script(js))
# Open each company page and get all details
companies = []
for url in companyUrls:
driver.get(url)
company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
name = company.find_element_by_css_selector("h2").text
email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
country = company.find_element_by_xpath(".//li[#class='location']/span[last()]").text.replace(",", "").strip()
address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[#class='location']/span[position() != last()]")])
I'm making a Craigslist scraper to scrape the titles, prices, date, and URL and exported that info to a CSV. Now, I want Selenium to click on the post URL to navigate to the actual page, parse the page to get a span tag "Odometer" (to get mileage), and return that to my CSV file.
Here's my code so far:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import schedule
from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd
class CraigslistScaper(object):
def __init__(self,query,location,max_price,transmission):
self.query = query
# self.sort=sort
self.location = location
# self.postal = postal
self.max_price = max_price
self.transmission = auto_transmission
#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
self.delay = 5
def load_craigslist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID,"searchform")))
print("page is ready")
except TimeoutError:
print('Loading took too much time')
#extracting the post information such as titles, dates, and prices
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name('result-row')
titles = []
dates = []
prices = []
post_info_list = []
for i in range(len(all_posts)):
post = all_posts[i]
title = post.text.split('$')
if title[0] == '':
title = title[1]
else:
title = title[0]
title = title.split("\n")
price = title[0]
title = title[-1]
title = title.split(' ')
month = title[0]
day = title[1]
date = month + " " + day
title = ' '.join(title[2:])
#print('PRICE: ' + (price))
#print('TITLE: ' + (title))
#print('DATE: ' + date)
lst = [price, title, date]
post_info_list.append(lst)
#f=open("miata_prices.csv", "a+")
#f.write(post_info_list)
#print(post_info_list)
#df = pd.DataFrame(post_info_list)
#df.to_csv('miata_prices.csv', index=False, header=False)
print(post_info_list)
return post_info_list
def save_post_info_and_urls_to_csv(self, post_info, post_urls):
for i in range(len(post_info)):
post_info[i].append(post_urls[i])
#print(post_info)
df = pd.DataFrame(post_info)
df.to_csv('miata_prices.csv', index=False, header=False)
return post_info
#extracting post URLs
def extract_post_urls(self):
url_list = []
soup = BeautifulSoup(self.driver.page_source,'html.parser')
aTagsInLi = self.driver.find_elements_by_css_selector('li a')
self.driver.find_elements_by_css_selector('li a')[0].click()
for a in aTagsInLi:
link = a.get_attribute('href')
print(link)
link = self.driver.find_element_by_link_text('Miata')
print(link)
link.click()
for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
#print(link.get('href'))
url_list.append(link.get('href'))
return url_list
#to click on URL Links and parse the HTML
def click_next_page(self):
href = driver.find_element_by_partial_link_text("result-title hdrlink")
extract_post_urls(url_list).click(href)
def quit(self):
self.driver.close()
location = "sfbay"
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"
scraper = CraigslistScaper(query,location,max_price,auto_transmission)
scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()
I manage to get everything to the CSV file, but I'm stuck on how I can get Selenium to open every link in a new tab, get the odometer information, then close the tab.
I'm using this to build a dataset and eventually do some analysis with it!
I have an example how to get Selenium to open every link and get the odometer information. I used a wrapper for Selenium (SeElements) for less code. I hope you will found out how it works. So:
I'm opening your link, scrapping all links from the titles to the list. Then open every link and trying to get odometer info.
from elementium.drivers.se import SeElements
from selenium import webdriver
browser = webdriver.Chrome()
url = 'https://sfbay.craigslist.org/search/ctaquery=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[#class="result-info"]/a', wait=True, ttl=5)
try:
links = []
for link in titles:
links.append(link.attribute('href'))
for link in links:
print(link)
browser.get(link)
try:
odometer = se.xpath('//span[contains(text(), "odometer")]',wait=True, ttl=2).text()
except Exception:
continue
print(odometer)
except Exception as e:
browser.quit()
raise e