Scraping contact information from a directory site
I am scraping contact information from the directory site.
I need to scrape it with Selenium, and it takes three steps:
1. Get the company URLs from the website.
2. Get all company URLs from the next page / all pages.
3. Scrape all contact information such as company name, website, email, etc.
My code is below, but I am facing two problems.
# -*- coding: utf-8 -*-
from time import sleep

from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

results = list()
driver = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2

for i in range(1, MAX_PAGE_NUM):
    page_num = str(i)
    url = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
    driver.get(url)
    sleep(5)
    sel = Selector(text=driver.page_source)
    companies = sel.xpath('//*[@id="categorypagehtml"]/div[1]/div[7]/ul/li/b//@href').extract()
    for i in range(0, len(companies)):
        print(companies[i])
        results.append(companies[i])
        print('---')

for result in results:
    url1 = "http://www.arabianbusinesscommunity.com" + result
    print(url1)
    driver.get(url1)
    sleep(5)
    sel = Selector(text=driver.page_source)
    name = sel.css('h2::text').extract_first()
    country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
    if country:
        country = country.strip()
    web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
    email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()
    records = []
    records.append((web, email, country, name))
    df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])
I wrote the code above, but I have two problems:
1. I can only get the last company's information.
2. On each iteration of the loop, the script visits all the URLs that were already visited before.
Can anyone help solve these problems?
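For reference, the first problem comes from `records` being re-created on every pass through the loop, so the DataFrame is only ever built from a single row. A minimal sketch of that fix, reusing the selectors from the code above (the second problem, re-visiting already scraped URLs, would additionally need the visited URLs deduplicated):

records = []  # create once, before the loop
for result in results:
    url1 = "http://www.arabianbusinesscommunity.com" + result
    driver.get(url1)
    sleep(5)
    sel = Selector(text=driver.page_source)
    name = sel.css('h2::text').extract_first()
    country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
    country = country.strip() if country else None
    web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
    email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()
    records.append((web, email, country, name))  # accumulate every company

df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])  # build the frame once, after the loop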
Here is code to get all company details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))

# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])

# Get all URLs for the first page and save them in the companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)

# Iterate through the remaining pages (up to and including the last) and collect all company URLs
for i in range(2, lastPageNum + 1):
    driver.get(baseUrl + "/" + str(i))
    companyUrls.extend(driver.execute_script(js))

# Open each company page and get all details
companies = []
for url in companyUrls:
    driver.get(url)
    company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
    name = company.find_element_by_css_selector("h2").text
    email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
    website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
    phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
    fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
    country = company.find_element_by_xpath(".//li[@class='location']/span[last()]").text.replace(",", "").strip()
    address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[@class='location']/span[position() != last()]")])
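The snippet stops at extracting the fields and never fills the companies list it creates. A minimal sketch of that collection step (the dict keys and the pandas export are assumptions, not part of the original answer):

    # still inside the for-url loop, after the extractions above
    companies.append({
        'name': name, 'email': email, 'website': website, 'phone': phone,
        'fax': fax, 'country': country, 'address': address,
    })

# after the loop: dump everything, e.g. with pandas
import pandas as pd
pd.DataFrame(companies).to_csv('companies.csv', index=False)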
I managed to scrape a lot of information from AirBnB, but I have two questions.
This is my code for scraping several pieces of information such as price, rating, etc.
Imports
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests, re
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
time.sleep(5)
Main code
url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'

data = []
for offset in range(0, 40, 20):
    driver.get(url.format(offset=offset))
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    detailed_pages = []
    for card in soup.select('div[class="c4mnd7m dir dir-ltr"]'):
        link = 'https://www.airbnb.com' + card.select_one('a[class="ln2bl2p dir dir-ltr"]')['href']
        detailed_pages.append(link)

    for page in detailed_pages:
        driver.get(page)
        time.sleep(3)
        soup2 = BeautifulSoup(driver.page_source, 'lxml')

        room_type = soup2.select_one('div._tqmy57')
        room_type = room_type.text if room_type else None

        r = requests.get(page)
        p_lat = re.compile(r'"lat":([-0-9.]+),')
        p_lng = re.compile(r'"lng":([-0-9.]+),')
        lat = p_lat.findall(r.text)[0]
        lng = p_lng.findall(r.text)[0]

        room_id = page[29: page.index("?")]

        titles = soup2.select_one('span._1n81at5')
        titles = titles.text if titles else None
        price = soup2.select_one('span._tyxjp1')
        price = price.text if price else None
        rating = soup2.select_one('span._12si43g')
        rating = rating.text if rating else None
        Bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
        Bedroom_area = Bedroom_area.text if Bedroom_area else None
        place_offers = ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])

        data.append({
            'Room_ID': room_id,
            'titles': titles,
            'place_offers': place_offers,
            'price': price,
            'rating': rating,
            'Bedroom_area': Bedroom_area,
            'Room_Type': room_type,
            'Latitude': lat,
            'Longitude': lng
        })

df = pd.DataFrame(data)
df
The first question is: how can I click on buttons like amenities, description, etc. and scrape them, since the landing page only shows some of that information, not all of it?
I know that there is a .click() function in Selenium, but I am trying the following code:
soup2.select_one('div.b6xigss dir dir-ltr').click()
and I am getting this error: 'NoneType' object has no attribute 'click'.
The second question is: how can I scrape the calendar data and find out which dates are blocked or not?
There are a few problems:
click() works only with Selenium (driver.find_element()) but not with BeautifulSoup (soup2.select_one()), so first you have to use a different function.
For some reason it can't find 'div.b6xigss.dir.dir-ltr', but it finds 'div.b6xigss button' (to be safe I search for the button, because the div can be "unclickable").
There is a message about cookies that hides this element, so Selenium can't click it. You would need to close this message (accept cookies), scroll the page to move the button into a visible place, or use JavaScript (driver.execute_script()) to click it.
This works for me
button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
driver.execute_script('arguments[0].click()', button)
Minimal working code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'

p_lat = re.compile(r'"lat":([-0-9.]+),')
p_lng = re.compile(r'"lng":([-0-9.]+),')

data = []
for offset in range(0, 40, 20):
    print('offset:', offset)
    driver.get(url.format(offset=offset))
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    detailed_pages = []
    for card in soup.select('div[class="c4mnd7m dir dir-ltr"] a[class="ln2bl2p dir dir-ltr"]'):
        link = 'https://www.airbnb.com' + card['href']
        detailed_pages.append(link)
    print('len(detailed_pages):', len(detailed_pages))

    for number, page in enumerate(detailed_pages, 1):
        print(number, 'page:', page)
        driver.get(page)
        time.sleep(5)
        soup2 = BeautifulSoup(driver.page_source, 'lxml')

        room_type = soup2.select_one('div._tqmy57')
        room_type = room_type.text if room_type else None

        #r = requests.get(page).text
        r = driver.page_source
        lat = p_lat.findall(r)[0]
        lng = p_lng.findall(r)[0]

        room_id = page[29: page.index("?")]

        titles = soup2.select_one('span._1n81at5')
        titles = titles.text if titles else None
        price = soup2.select_one('span._tyxjp1')
        price = price.text if price else None
        rating = soup2.select_one('span._12si43g')
        rating = rating.text if rating else None
        bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
        bedroom_area = bedroom_area.text if bedroom_area else None
        place_offers = ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])

        try:
            button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
            driver.execute_script('arguments[0].click()', button)
        except Exception as ex:
            print('Exception:', ex)

        data.append({
            'Room_ID': room_id,
            'titles': titles,
            'place_offers': place_offers,
            'price': price,
            'rating': rating,
            'Bedroom_area': bedroom_area,
            'Room_Type': room_type,
            'Latitude': lat,
            'Longitude': lng
        })

df = pd.DataFrame(data)
df.to_csv('output.csv')
print(df)
EDIT:
As for the calendar: every date has aria-disabled="true" or aria-disabled="false", so you can use aria-disabled to detect dates in the calendar, and later you can read the value of aria-disabled like any other attribute: item["aria-disabled"].
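A minimal sketch of reading that attribute, assuming the calendar day elements located by the XPath in the edit below also carry aria-disabled (an assumption, not verified against the page):

xpath = '//div[@aria-label="Calendar"]//div[@data-testid]'
for item in driver.find_elements(By.XPATH, xpath):
    # aria-disabled is read like any other attribute
    print(item.get_attribute("aria-disabled"), '|', item.get_attribute("data-testid"))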
EDIT:
This works for me
for number, page in enumerate(detailed_pages, 1):
    print(number, 'page:', page)
    driver.get(page)
    time.sleep(5)

    # ... other code ...

    xpath = '//div[@aria-label="Calendar"]//div[@data-testid]'
    for item in driver.find_elements(By.XPATH, xpath):
        date = item.get_attribute("data-testid")
        blocked = item.get_attribute("data-is-day-blocked")
        print(blocked, '|', date)
The result looks like this:
true | calendar-day-09/18/2022
true | calendar-day-09/19/2022
true | calendar-day-09/20/2022
false | calendar-day-09/21/2022
false | calendar-day-09/22/2022
false | calendar-day-09/23/2022
Can anyone help with scraping from https://www.whed.net/home.php?
The code I'm using gives me an empty df. I would love to get the universities with their websites and maybe the fields of study. My scraping skills are weak, so if you can guide me through this it would be great, thanks guys.
import time
import pandas as pd
from selenium import webdriver as wb
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException

begin = time.time()
countries = ['Emirates', 'United States of America (all)']
result = []  # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # To launch chrome and run script

# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el:
    countries.append(c.text)
for g in grps:
    countries.append(g.text)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # select country dropdown
    select.select_by_visible_text(cntry)  # choosing country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # select results-per-page dropdown
    select_rpp.select_by_visible_text('100')  # choosing 100 results per page option
    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')  # list of university elements
    for univ in range(len(university_list)):
        href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
        univ_links.append(href)
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
                univ_links.append(href)
        except NoSuchElementException:
            break

for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')
    t2 = webD.find_elements_by_class_name('dd')
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {}
    fos = ''
    fos1 = ''
    temp.update({'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name})
    for i in range(len(t1)):
        if t1[i].text == '' or t1[i].text == 'Address':
            continue
        else:
            value = t2[i].text
            temp.update({t1[i].text: value.replace('\n', ',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info = content[j].text
                fos = fos + ',' + info
            elif labels[j].text == 'Job title:':
                info1 = content[j].text
                fos1 = fos1 + ',' + info1
            else:
                key = labels[j].text
                temp.update({key[:-1]: content[j].text})
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
data
end = time.time()
print("Time taken : " + str(end - begin) + "s")
data.to_csv("WHED1.csv", index=False)
This code is what I could use, taken from a GitHub project.
It would be great if I could re-create the data and save it. I want to use it as a dropdown in a web application, just to make sure there are no mistakes in how the university someone studied at is written.
Update 1/12/22 - Async
I found a much better solution using aiohttp; it also runs through the entire list of countries in ~30 seconds instead of 3 hours.
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def main():
    print("Init")
    driver = init_driver()

    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Gathering Countries")
    countries = get_countries(driver)
    driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")
    f = open('output.json', 'w')
    f.write(json.dumps(institution_list))
    f.close()

    end = time.time()
    print(f"Total time: {end - start}s")


def init_driver():
    chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
    return driver


def get_countries(driver):
    select = Select(driver.find_element(By.ID, "Chp1"))
    countries = list(map(lambda c: c.get_attribute('value'), select.options))
    countries.pop(0)
    return countries


def extract_institutions(html, country):
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    print(str(page))

    number_of_institutions = str(page).split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []

    results = []
    inst_index = 0
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
        inst_index += 1

    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }


async def get_institutions(country, session):
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")


async def fetch_all(countries):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[get_institutions(country, session) for country in countries])


# Main call
main()
Old answer using a synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work as it gets stuck on the same page.
I also added direct access to the name and URL to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)

print("Searching")
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    print(f'{len(results)}/{number_of_pages}')
    if counter >= int(number_of_pages):
        break
    counter += 10
    driver.find_element(By.LINK_TEXT, "Next page").click()
    time.sleep(0.5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

driver.quit()
print(results)
You can use Selenium to scrape the data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape other countries as well, either in a loop or by entering each name manually. If you need the field of study for every university, you can scrape each university's href with bs4 and then its field of study.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
    raw = soup.find_all('div', {'class': 'details'})
    for i in raw:
        i = (str(i.text).lstrip())
        i = i.replace("\n", "")
        i = i.replace("\r", "")
        i = i.replace("\t", "")
        print(i)
    next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
    counter += 10
driver.quit()
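The loop over countries mentioned above is not shown in the snippet. A minimal sketch of it, run in place of the single select_by_visible_text() call (before driver.quit()); reading every option's visible text back into select_by_visible_text() is an assumption:

select = Select(driver.find_element(By.ID, "Chp1"))
country_names = [o.text for o in select.options if o.text.strip()]  # skip any empty placeholder entry
for name in country_names:
    Select(driver.find_element(By.ID, "Chp1")).select_by_visible_text(name)
    driver.find_element(By.XPATH, "//input[@value='Go']").click()
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for block in soup.find_all('div', {'class': 'details'}):
        print(name, '|', ' '.join(block.text.split()))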
I am trying to scrape product details from AliExpress. I have two questions. First, how do I scrape the category and save it in the CSV file in front of each product? Second, how do I move to the 2nd and subsequent pages until there are no more pages available, or until page 10?
This is the code I have written to find the next pages:
from bs4 import BeautifulSoup
import requests as r

page = r.get('https://www.aliexpress.com/category/200000664/jackets.html?spm=2114.11010108.102.4.650c649b8lfPOb')
soup = BeautifulSoup(page.content, 'html.parser')
content = soup.find(id="pagination-bottom")
pages = content.findAll('a')
for i in pages:
    print('https:' + i.get('href'))
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import urllib.request

filename = "alibaba.csv"
f = open(filename, "w")
headers = "product_name, price, Rating \n"
f.write(headers)


class alibabascrape(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.url = f"https://www.aliexpress.com/wholesale?catId=0&initiative_id=&SearchText={keyword}"
        self.driver = webdriver.Firefox(executable_path=r'c:\geckodriver.exe')
        self.delay = 3

    def load_alibabalist_url(self):
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_all_elements_located((By.ID, "form-searchbar")))
            print("page is ready")
        except TimeoutException:
            print("Too much Time")

    def extract_post_information(self):
        all_posts = self.driver.find_elements_by_class_name("list-item")
        post_title_list = []
        for post in all_posts:
            title = post.text.split("\n")
            name = title[0]
            print(name)
            price = title[2]
            print(price)
            rating = title[6]
            print(rating)
            f.write(name + "," + price + "," + rating + "\n")
            post_title_list.append(post.text)
        return post_title_list

    def extract_category(self):
        category = self.driver.find_elements_by_class_name("col-sub")
        print(category)

    def extract_post_urls(self):
        url_list = []
        html_page = urllib.request.urlopen(self.url)
        soup = BeautifulSoup(html_page, "lxml")
        for link in soup.findAll("a", {"class": "history-item product"}):
            print(link["href"])
            url_list.append(link["href"])
        return url_list


keyword = "iphone"
scrapper = alibabascrape(keyword)
scrapper.load_alibabalist_url()
scrapper.extract_post_information()
scrapper.extract_category()
scrapper.extract_post_urls()
I can help you with pagination:
If you have collected all the page links, you can simply use a for loop to iterate over them.
If you only have a prev/next page link, use a while loop that checks whether the link exists and then clicks on it.
Both patterns are sketched below.
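A minimal sketch of both patterns with Selenium (the selectors and the "Next" link text are illustrative assumptions, not taken from AliExpress; the pagination id comes from the question's own snippet):

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
driver.get("https://www.aliexpress.com/category/200000664/jackets.html")

# Pattern 1: collect all page links first, then visit each one.
page_links = [a.get_attribute("href")
              for a in driver.find_elements_by_css_selector("#pagination-bottom a")]
for link in page_links:
    driver.get(link)
    # ... scrape the product cards on this page ...

# Pattern 2: keep clicking "Next" until it disappears, or until page 10.
page_number = 1
while page_number < 10:
    try:
        driver.find_element_by_link_text("Next").click()  # "Next" link text is an assumption
    except NoSuchElementException:
        break
    page_number += 1
    # ... scrape the product cards on this page ...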
I'm making a Craigslist scraper to scrape the titles, prices, dates, and URLs and export that info to a CSV. Now, I want Selenium to click on each post URL to navigate to the actual page, parse the page to get a span tag "odometer" (to get mileage), and return that to my CSV file.
Here's my code so far:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import schedule
from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd


class CraigslistScaper(object):
    def __init__(self, query, location, max_price, transmission):
        self.query = query
        # self.sort=sort
        self.location = location
        # self.postal = postal
        self.max_price = max_price
        self.transmission = auto_transmission

        #https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
        self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
        self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
        self.delay = 5

    def load_craigslist_url(self):
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("page is ready")
        except TimeoutError:
            print('Loading took too much time')

    #extracting the post information such as titles, dates, and prices
    def extract_post_information(self):
        all_posts = self.driver.find_elements_by_class_name('result-row')
        titles = []
        dates = []
        prices = []
        post_info_list = []
        for i in range(len(all_posts)):
            post = all_posts[i]
            title = post.text.split('$')
            if title[0] == '':
                title = title[1]
            else:
                title = title[0]
            title = title.split("\n")
            price = title[0]
            title = title[-1]
            title = title.split(' ')
            month = title[0]
            day = title[1]
            date = month + " " + day
            title = ' '.join(title[2:])
            #print('PRICE: ' + (price))
            #print('TITLE: ' + (title))
            #print('DATE: ' + date)
            lst = [price, title, date]
            post_info_list.append(lst)
        #f=open("miata_prices.csv", "a+")
        #f.write(post_info_list)
        #print(post_info_list)
        #df = pd.DataFrame(post_info_list)
        #df.to_csv('miata_prices.csv', index=False, header=False)
        print(post_info_list)
        return post_info_list

    def save_post_info_and_urls_to_csv(self, post_info, post_urls):
        for i in range(len(post_info)):
            post_info[i].append(post_urls[i])
        #print(post_info)
        df = pd.DataFrame(post_info)
        df.to_csv('miata_prices.csv', index=False, header=False)
        return post_info

    #extracting post URLs
    def extract_post_urls(self):
        url_list = []
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        aTagsInLi = self.driver.find_elements_by_css_selector('li a')
        self.driver.find_elements_by_css_selector('li a')[0].click()
        for a in aTagsInLi:
            link = a.get_attribute('href')
            print(link)
        link = self.driver.find_element_by_link_text('Miata')
        print(link)
        link.click()
        for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
            #print(link.get('href'))
            url_list.append(link.get('href'))
        return url_list

    #to click on URL Links and parse the HTML
    def click_next_page(self):
        href = driver.find_element_by_partial_link_text("result-title hdrlink")
        extract_post_urls(url_list).click(href)

    def quit(self):
        self.driver.close()


location = "sfbay"
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"

scraper = CraigslistScaper(query, location, max_price, auto_transmission)
scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()
I managed to get everything into the CSV file, but I'm stuck on how to get Selenium to open every link in a new tab, get the odometer information, then close the tab.
I'm using this to build a dataset and will eventually do some analysis with it!
I have an example of how to get Selenium to open every link and get the odometer information. I used a wrapper for Selenium (SeElements) for less code; I hope you will figure out how it works. So:
I open your link and scrape all the links from the titles into a list. Then I open every link and try to get the odometer info.
from elementium.drivers.se import SeElements
from selenium import webdriver

browser = webdriver.Chrome()
url = 'https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[@class="result-info"]/a', wait=True, ttl=5)
try:
    links = []
    for link in titles:
        links.append(link.attribute('href'))
    for link in links:
        print(link)
        browser.get(link)
        try:
            odometer = se.xpath('//span[contains(text(), "odometer")]', wait=True, ttl=2).text()
        except Exception:
            continue
        print(odometer)
except Exception as e:
    browser.quit()
    raise e
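The answer above navigates in the same window. Since the question asks specifically about opening each link in a new tab and closing it again, here is a minimal sketch of that variant using plain Selenium window handles (the odometer XPath is reused from the answer; everything else is an assumption):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1')

links = [a.get_attribute('href')
         for a in driver.find_elements(By.XPATH, '//p[@class="result-info"]/a')]
main_handle = driver.current_window_handle

for link in links:
    driver.execute_script("window.open(arguments[0]);", link)   # open the post in a new tab
    new_handle = [h for h in driver.window_handles if h != main_handle][0]
    driver.switch_to.window(new_handle)
    try:
        odometer = driver.find_element(By.XPATH, '//span[contains(text(), "odometer")]').text
        print(link, odometer)
    except Exception:
        pass                                                     # some posts have no odometer field
    driver.close()                                               # close the tab
    driver.switch_to.window(main_handle)                         # go back to the results tab

driver.quit()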
Using Python, Selenium, Sublime, and Firefox: I am scraping the links off of this website and would like to save the scraped pages (as HTML files) into a folder. However, I have been working for days on trying to get the body of these HTML files dumped into a Dropbox folder. The problems are 1) saving the HTML files and 2) saving them to a Dropbox folder (or any folder).
I have successfully written code that performs a search and then scrapes the links off of a series of webpages. The following code works well for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
import csv
import pickle
import signal
import time


def handler(signum, frame):
    raise Exception('Last Resort!')

signal.signal(signal.SIGALRM, handler)


def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"


def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)


def waitUntilReadyBreak(browser_b, url, counter):
    try:
        signal.alarm(counter)
        waitUntilReady(browser_b)
        signal.alarm(0)
    except Exception, e:
        print e
        signal.alarm(0)
        browser_b.close()
        browser_b = webdriver.Firefox()
        browser_b.get(url)
        waitUntilReadyBreak(browser_b, url, counter)
    return browser_b


browser = webdriver.Firefox()
thisurl = 'http://www.usprwire.com/cgi-bin/news/search.cgi'
browser.get(thisurl)
waitUntilReady(browser)
numarticles = 0
elem = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
elem = browser.find_element_by_name("query")
elem.send_keys('"test"')
form = browser.find_element_by_xpath("/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[3]/td[2]/table/tbody/tr[3]/td/table/tbody/tr[1]/td/font/input[2]").click()

nextpage = False
all_newproduct_links = []
npages = 200

for page in range(1, npages + 1):
    if page == 1:
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_A_PAGE"
        elem = browser.find_element_by_link_text('[>>]').click()
        waitUntilReady(browser)
    if page >= 2 <= 200:
        # click the dots
        print page
        print page
        print "B4 LastLoop"
        elems = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "category_links")))
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_C_PAGE"
        # This is the part that will not work :(
        for e in elems:
            numarticles = numarticles + 1
            numpages = 0
            numpages = numpages + 1000
            article_url = e.get_attribute('href')
            print 'waiting'
            bodyelem.send_keys(Keys.COMMAND + "2")
            browser.get(article_url)
            waitUntilReady(browser)
            fw = open('/Users/My/Dropbox/MainFile/articlesdata/' + str(page) + str(numpages) + str(numarticles) + '.html', 'w')
            fw.write(browser.page_source.encode('utf-8'))
            fw.close()
            bodyelem2 = browser.find_elements_by_xpath("//body")[0]
            bodyelem2.send_keys(Keys.COMMAND + "1")
The block above (for e in elems:) is meant to click through to each article and create an HTML file containing the body of the scraped page. I seem to be missing something fundamental.
Any guidance at all would be most appreciated.
I think you are overcomplicating it.
There is at least one problem in this block:
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
               for elems in browser.find_elements_by_class_name('category_links')]
elems would contain a list of elements found by find_elements_by_tag_name(), but then you use the same elems variable in the list comprehension. As a result, when you iterate over elems later, you get an error, since elems now refers to a single element and not a list.
Anyway, here is the approach I would take:
gather all the article urls first
iterate over the urls one by one and save the HTML source using the page url name as a filename. E.g. _Iran_Shipping_Report_Q4_2014_is_now_available_at_Fast_Market_Research_326303.shtml would be the article filename
The code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"


def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)


browser = webdriver.Firefox()
browser.get('http://www.usprwire.com/cgi-bin/news/search.cgi')

# make a search
query = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
query.send_keys('"test"')

submit = browser.find_element_by_xpath("//input[@value='Search']")
submit.click()

# grab article urls
npages = 4
article_urls = []
for page in range(1, npages + 1):
    article_urls += [elm.get_attribute("href") for elm in browser.find_elements_by_class_name('category_links')]
    browser.find_element_by_link_text('[>>]').click()

# iterate over urls and save the HTML source
for url in article_urls:
    browser.get(url)
    waitUntilReady(browser)

    title = browser.current_url.split("/")[-1]
    with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w') as fw:
        fw.write(browser.page_source.encode('utf-8'))
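The answer is written for Python 2 (page_source is encoded before being written to a text-mode file). A minimal sketch of the same saving step on Python 3, assuming the same Dropbox path:

# Python 3 variant of the saving loop: page_source is already a str,
# so write it with an explicit encoding instead of encoding it manually.
for url in article_urls:
    browser.get(url)
    waitUntilReady(browser)

    title = browser.current_url.split("/")[-1]
    with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)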