I am scraping this website :
https://login.aviva.com.sg/directinsurance/homeinsurance.htm
I want to fill all the form elements from a csv file (we will call it "profil"). When I fill the form with a single "profil" everything works fine, but when I loop over the different profiles I have several problems:
- sometimes I can't "get" the full address from the postal code (see below), so I can't get the final quotation;
- the driver refreshes, but I don't get the quote for the different people in profil.
Here is a type of profil that I have:
profil = [["MRS ","Corinne","SIMON","F","M","600 ","No, for myself and my family","72603190","2017-H1","CO ","Ridout Road","10","91 - 124","27 - 38","099197","S4553141D","1958","5","1"],
["MS ","Corinne","MOREAU","F","D","610 ","Yes, for myself","63856280","2017-H1","CO ","Stevens","10","38 - 208","24 - 40","099198","S9186686B","1999","10","1"],
["MDM ","Corinne","DUBOIS","F","W","620 ","Yes,for my family","71852991","2017-H1","CO ","Stevens","10","38 - 208","24 - 40","099200","S2243858A","1974","2","1"]
]`
This is the Python code that I have made so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import csv
driver = webdriver.Firefox()
driver.get("https://login.aviva.com.sg/directinsurance/homeinsurance.htm")
for people in profil:
    # Select all the dropdowns by name
    dropdown_salutation = Select(driver.find_element_by_name("person.salutationRef"))
    dropdown_occupation = Select(driver.find_element_by_name("person.occupationRef"))
    dropdown_maritalstatus = Select(driver.find_element_by_name("person.maritalstat"))
    dropdown_gender = Select(driver.find_element_by_name("person.gender"))
    dropdown_dobDay = Select(driver.find_element_by_name("dobDay"))
    dropdown_dobMonth = Select(driver.find_element_by_name("dobMonth"))
    dropdown_dobYear = Select(driver.find_element_by_name("dobYear"))
    dropdown_declaration1 = Select(driver.find_element_by_name("declaration1"))
    dropdown_declaration2 = Select(driver.find_element_by_name("declaration2"))
    # Now we look for all the other elements that we can fill (we select by id first)
    FamilyName_input = driver.find_element_by_id("surname")
    GivenName_input = driver.find_element_by_id("givname")
    NRIC_input = driver.find_element_by_id("nric")
    PostalCode = driver.find_element_by_id("postalCode")
    MobileNo = driver.find_element_by_id("textfield5")
    Email = driver.find_element_by_id("email")
    # Then we fill everything
    dropdown_salutation.select_by_value(people[0])
    GivenName_input.send_keys(people[1])
    FamilyName_input.send_keys(people[2])
    dropdown_gender.select_by_value(people[3])
    dropdown_maritalstatus.select_by_value(people[4])
    dropdown_occupation.select_by_value(people[5])
    MobileNo.send_keys(people[7])
    NRIC_input.send_keys(people[15])
    dropdown_dobYear.select_by_value(people[16])
    dropdown_dobMonth.select_by_value(people[17])
    dropdown_dobDay.select_by_value(people[18])
    Email.send_keys("ada@hotmail.com")
    dropdown_declaration1.select_by_value("Y")
    dropdown_declaration2.select_by_value("Y")
    PostalCode.send_keys(people[14])
    wait = WebDriverWait(driver, 30)
    # Now we can get the full address based on the postal code we provide
    # Here I have a first problem
    driver.find_element_by_id("btnAddress").click()
    wait = WebDriverWait(driver, 30)
    element = wait.until(EC.element_to_be_clickable((By.ID, 'immediateFamilySaf')))
    dropdown_declaration3 = Select(driver.find_element_by_name("policy.immediateFamilySaf"))
    dropdown_declaration3.select_by_value("N")
    # Now we click on Next to move forward to the second page of the form
    Next = driver.find_element_by_css_selector("a[onclick*=checkFirstTab]")
    Next.click()
    UnitNo = driver.find_element_by_css_selector("a[onclick*=proceedNoUnitNo]")
    UnitNo.click()
    # Now we can fill the "cover needed" form
    dropdown_plan = Select(driver.find_element_by_name("homeProd.planTypeRef"))
    dropdown_dwelling = Select(driver.find_element_by_name("homeProd.dwellingTypeRef"))
    dropdown_insureadr = Select(driver.find_element_by_name("homeProd.addressType"))
    dropdown_coverday = Select(driver.find_element_by_name("coverStartDay"))
    dropdown_covermonth = Select(driver.find_element_by_name("coverStartMonth"))
    dropdown_coveryear = Select(driver.find_element_by_name("coverStartYear"))
    dropdown_plan.select_by_value("HI ")
    dropdown_dwelling.select_by_value(people[9])
    dropdown_insureadr.select_by_value("S")
    dropdown_coverday.select_by_value("1")
    dropdown_covermonth.select_by_value("4")
    dropdown_coveryear.select_by_value("2018")
    # Now we can grab the next button and pass to the third tab
    SecondTab = driver.find_element_by_name("_target0")
    SecondTab.click()
    # Now we can grab the quote
    ThirdTab = driver.find_element_by_name("_target1")
    ThirdTab.click()
    time.sleep(3)
    driver.save_screenshot('img' + people[2] + '.png')
    html = driver.page_source
    # We can feed that into Beautiful Soup
    doc = BeautifulSoup(html, "html.parser")
    rows = doc.find('table', id='table-termsofplan').find_all('td', attrs={'class': None})
    premiums = []
    for row in rows:
        # Find the ones that don't have 'style' as an attribute
        if 'style' in row.attrs:
            # Skip it! It's a header or footer row
            pass
        else:
            premium = {
                'type of plan': rows[1].text,
                '12 Months premium': rows[2].text,
                '24 Months premium': rows[3].text,
                '36 Months premium': rows[4].text,
                'Total Premium 12 Months': rows[10].text,
                'Total Premium 24 Months': rows[11].text,
                'Total Premium 36 Months': rows[12].text,
                'Goods and services Tax 12 Months': rows[14].text,
                'Goods and services Tax 24 Months': rows[15].text,
                'Goods and services Tax 36 Months': rows[16].text,
                'Single Payment 12 Months': rows[19].text,
                'Single Payment 24 Months': rows[20].text,
                'Single Payment 36 Months': rows[21].text,
            }
            premiums.append(premium)
    driver.get("https://login.aviva.com.sg/directinsurance/homeinsurance.htm")
driver.close()
import pandas as pd
premium_df = pd.DataFrame(premiums)
premium_df.to_csv("premium.csv", index=False)
I'm pretty new to web scraping and would appreciate any advice for the scenarios below:
I'm trying to produce a home loans listing table using data from https://www.canstar.com.au/home-loans/
I'm mainly trying to get listings values like the ones below:
Homestar Finance | Star Essentials P&I 80% | Variable
Unloan | Home Loan LVR <80% | Variable
TicToc Home Loans | Live-in Variable P&I | Variable
ubank | Neat Home Loan Owner Occupied P&I 70-80% | Variable
and push them into a nested table
results = [[Homestar Finance, Star Essentials P&I 80%, Variable], etc, etc]
For my first attempt, I used BeautifulSoup entirely and practiced on an offline version of the site.
import pandas as pd
from bs4 import BeautifulSoup
with open('/local/path/canstar.html', 'r') as canstar_offline:
    content = canstar_offline.read()

results = [['Affiliate', 'Product Name', 'Product Type']]
soup = BeautifulSoup(content, 'lxml')

for listing in soup.find_all('div', class_='table-cards-container'):
    for listing1 in listing.find_all('a'):
        if listing1.text.strip() != 'More details' and listing1.text.strip() != '':
            results.append(listing1.text.strip().split(' | '))

df = pd.DataFrame(results[1:], columns=results[0]).to_dict('list')
df2 = pd.DataFrame(df)
print(df2)
I got pretty close to what I wanted, but unfortunately it doesn't work on the actual site, because it looks like I'm getting blocked for repeated requests.
So I tried this again with Selenium, but now I'm stuck.
I tried to reuse as much of the filtering logic from BeautifulSoup as I could, but I can't get anywhere close to what I had.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    listing = table.find_element(By.TAG_NAME, 'a')
    print(listing.text)
This version (above) only returns one listing (I'm trying to get the entire table through iteration)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    # listing = table.find_element(By.TAG_NAME, 'a')
    print(table.text)
This version (above) looks like it gets all the text from the 'table-cards-container' class, but I'm unable to filter through it to just get the listings.
I think you can try something like this, I hope the comments in the code explain what it is doing.
# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Initiate the driver and navigate
driver = webdriver.Chrome()
url = 'https://www.canstar.com.au/home-loans'
driver.get(url)
# We save the loans list
loans = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//cnslib-table-card")))
# We loop once per loan (XPath positions are 1-based, so go up to len(loans))
for i in range(1, len(loans) + 1):
    # With this XPath I save the title of the loan
    loan_title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    print(loan_title)
    # With this XPath I save the first percentage we see for the loan
    loan_first_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[1]"))).text
    print(loan_first_percentage)
    # With this XPath I save the second percentage we see for the loan
    loan_second_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[3]"))).text
    print(loan_second_percentage)
    # With this XPath I save the amount we see for the loan
    loan_amount = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[5]"))).text
    print(loan_amount)
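If you want the nested results list from the question rather than printed lines, a small extension of the loop above could append each loan to a list and build a DataFrame at the end. This is only a sketch: the column names are my own choice, and it assumes the same XPaths keep matching.

# Sketch only: collect the scraped fields instead of printing them.
import pandas as pd

results = []
for i in range(1, len(loans) + 1):
    title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    first_rate = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[1]"))).text
    second_rate = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[3]"))).text
    amount = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[5]"))).text
    results.append([title, first_rate, second_rate, amount])

df = pd.DataFrame(results, columns=['Title', 'First rate', 'Second rate', 'Amount'])
print(df)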
I managed to scrape a lot of information from Airbnb, but I have two questions.
This is my code for scraping several information such as price, rating etc.
Imports
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests, re
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
time.sleep(5)
Main code
url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'

data = []
for offset in range(0, 40, 20):
    driver.get(url.format(offset=offset))
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    detailed_pages = []
    for card in soup.select('div[class="c4mnd7m dir dir-ltr"]'):
        link = 'https://www.airbnb.com' + card.select_one('a[class="ln2bl2p dir dir-ltr"]')['href']
        detailed_pages.append(link)

    for page in detailed_pages:
        driver.get(page)
        time.sleep(3)
        soup2 = BeautifulSoup(driver.page_source, 'lxml')

        room_type = soup2.select_one('div._tqmy57')
        room_type = room_type.text if room_type else None

        r = requests.get(page)
        p_lat = re.compile(r'"lat":([-0-9.]+),')
        p_lng = re.compile(r'"lng":([-0-9.]+),')
        lat = p_lat.findall(r.text)[0]
        lng = p_lng.findall(r.text)[0]

        room_id = page[29: page.index("?")]

        titles = soup2.select_one('span._1n81at5')
        titles = titles.text if titles else None
        price = soup2.select_one('span._tyxjp1')
        price = price.text if price else None
        rating = soup2.select_one('span._12si43g')
        rating = rating.text if rating else None
        Bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
        Bedroom_area = Bedroom_area.text if Bedroom_area else None
        place_offers = ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])

        data.append({
            'Room_ID': room_id,
            'titles': titles,
            'place_offers': place_offers,
            'price': price,
            'rating': rating,
            'Bedroom_area': Bedroom_area,
            'Room_Type': room_type,
            'Latitude': lat,
            'Longitude': lng
        })
df=pd.DataFrame(data)
df
The first question is: how can I click on buttons like amenities, description, etc. and scrape them, since the landing page only shows some of this information but not all of it?
I know there is a .click() function in Selenium, but I am trying the following code:
soup2.select_one('div.b6xigss dir dir-ltr').click()
and I am getting this error: 'NoneType' object has no attribute 'click'.
The second question is how can I scrape the calendar data and which dates are blocked or not ?
There are a few problems:
.click() works only with Selenium (driver.find_element()), not with BeautifulSoup (soup2.select_one()), so first you have to use a different function.
For some reason it can't find 'div.b6xigss.dir.dir-ltr', but it finds 'div.b6xigss button' (to be safe I search for the button, because the div can be "unclickable").
There is a message about cookies which hides this element, so Selenium can't click it. You would need to close this message (accept the cookies), or scroll the page to move the button into a visible place, or use JavaScript (driver.execute_script()) to click it.
This works for me
button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
driver.execute_script('arguments[0].click()', button)
Minimal working code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'

p_lat = re.compile(r'"lat":([-0-9.]+),')
p_lng = re.compile(r'"lng":([-0-9.]+),')

data = []

for offset in range(0, 40, 20):
    print('offset:', offset)
    driver.get(url.format(offset=offset))
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    detailed_pages = []
    for card in soup.select('div[class="c4mnd7m dir dir-ltr"] a[class="ln2bl2p dir dir-ltr"]'):
        link = 'https://www.airbnb.com' + card['href']
        detailed_pages.append(link)
    print('len(detailed_pages):', len(detailed_pages))

    for number, page in enumerate(detailed_pages, 1):
        print(number, 'page:', page)
        driver.get(page)
        time.sleep(5)
        soup2 = BeautifulSoup(driver.page_source, 'lxml')

        room_type = soup2.select_one('div._tqmy57')
        room_type = room_type.text if room_type else None

        #r = requests.get(page).text
        r = driver.page_source
        lat = p_lat.findall(r)[0]
        lng = p_lng.findall(r)[0]

        room_id = page[29: page.index("?")]

        titles = soup2.select_one('span._1n81at5')
        titles = titles.text if titles else None
        price = soup2.select_one('span._tyxjp1')
        price = price.text if price else None
        rating = soup2.select_one('span._12si43g')
        rating = rating.text if rating else None
        bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
        bedroom_area = bedroom_area.text if bedroom_area else None
        place_offers = ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])

        try:
            button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
            driver.execute_script('arguments[0].click()', button)
        except Exception as ex:
            print('Exception:', ex)

        data.append({
            'Room_ID': room_id,
            'titles': titles,
            'place_offers': place_offers,
            'price': price,
            'rating': rating,
            'Bedroom_area': bedroom_area,
            'Room_Type': room_type,
            'Latitude': lat,
            'Longitude': lng
        })
df = pd.DataFrame(data)
df.to_csv('output.csv')
print(df)
EDIT:
As for calendar: every date has aria-disabled=True or aria-disabled=False and you can use aria-disabled to detect dates in calendar and later you can get value from aria-disabled like from any other attribute - item["aria-disabled"]
EDIT:
This works for me
for number, page in enumerate(detailed_pages, 1):
    print(number, 'page:', page)
    driver.get(page)
    time.sleep(5)

    # ... other code ...

    xpath = '//div[@aria-label="Calendar"]//div[@data-testid]'
    for item in driver.find_elements(By.XPATH, xpath):
        date = item.get_attribute("data-testid")
        blocked = item.get_attribute("data-is-day-blocked")
        print(blocked, '|', date)
Result like this:
true | calendar-day-09/18/2022
true | calendar-day-09/19/2022
true | calendar-day-09/20/2022
false | calendar-day-09/21/2022
false | calendar-day-09/22/2022
false | calendar-day-09/23/2022
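If you want that availability as data rather than printed lines, a small sketch building on the same two attributes might be:

# Sketch only: collect calendar availability into a dictionary keyed by date.
xpath = '//div[@aria-label="Calendar"]//div[@data-testid]'
availability = {}
for item in driver.find_elements(By.XPATH, xpath):
    date = item.get_attribute("data-testid")  # e.g. "calendar-day-09/21/2022"
    availability[date] = item.get_attribute("data-is-day-blocked") == "true"
blocked_dates = [d for d, blocked in availability.items() if blocked]
print(blocked_dates)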
Firstly, I'm sorry for my poor English. I'm kinda new to Python. I would like to know how to scrape the number of posts, the number of followers, and the number of following for certain accounts (I try to loop over them) and store the data in CSV files.
I've been trying to figure out the XPath, and I thought my XPath was already correct, so what did I miss?
Here are my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
'https://www.instagram.com/acc_1/',
'https://www.instagram.com/acc_2/',
'https://www.instagram.com/acc_3/',
'https://www.instagram.com/acc_4/',
'https://www.instagram.com/acc_5/',
'https://www.instagram.com/acc_6/',
'https://www.instagram.com/acc_7/',
'https://www.instagram.com/acc_8/',
'https://www.instagram.com/acc_9/',
'https://www.instagram.com/acc_10/',
'https://www.instagram.com/acc_11/',
'https://www.instagram.com/acc_12/',
'https://www.instagram.com/acc_13/',
'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
# open directly
# collecting data
for url in urls:
    PATH = 'C:\webdrivers\chromedriver.exe.'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)

    # log in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)

    # "Save Your Login Info?" dialog
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)

    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text

    #username_channel.append(usernameChan)
    #number_of_post_chan.append(numb_of_post)
    #followers_chan.append(followers)
    #followings_chan.append(followings)
    #description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)

    account_items = {
        "username_ig": username_channel,
        "jumlah_posting": number_of_post_chan,
        "followers": followers_chan,
        "followings": followings_chan,
        "deskripsi": description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there any better way to express the elements? Help.
Thank you in advance.
To get the username, number of posts, followers, following and description you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, instead of the next 5 lines you can add the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT: As you said, you got an IndexError: list index out of range while fetching details above. This is probably because the elements had not loaded yet. With the imports below, replace the line where we fetch details with the one in the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not
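One more note, separate from the selectors: in the question's loop the .append() calls are commented out, so the lists (and therefore the final DataFrame) stay empty. A minimal sketch of the collection step, reusing the question's own variable names, is simply to run those appends at the end of each iteration:

# Sketch only: store each account's values before moving to the next URL
username_channel.append(usernameChan)
number_of_post_chan.append(numb_of_post)
followers_chan.append(followers)
followings_chan.append(followings)
description_chan.append(description)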
scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need to scrape it with Selenium. It needs 3 steps:
1. get the company URLs from the website.
2. get all company URLs from the next page / all pages.
3. scrape all contact information such as company name, website, email, etc.
The code is below, but I face two problems.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for i in range(1, MAX_PAGE_NUM):
    page_num = str(i)
    url = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
    driver.get(url)
    sleep(5)
    sel = Selector(text=driver.page_source)
    companies = sel.xpath('//*[@id="categorypagehtml"]/div[1]/div[7]/ul/li/b//@href').extract()
    for i in range(0, len(companies)):
        print(companies[i])
        results.append(companies[i])
    print('---')
    for result in results:
        url1 = "http://www.arabianbusinesscommunity.com" + result
        print(url1)
        driver.get(url1)
        sleep(5)
        sel = Selector(text=driver.page_source)
        name = sel.css('h2::text').extract_first()
        country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
        if country:
            country = country.strip()
        web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
        email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()
        records = []
        records.append((web, email, country, name))
df = pd.DataFrame(records, columns=['web','email', 'country', 'name'])
I wrote the code as above, but I have two problems:
1. I can only get the last company's information.
2. On each iteration of the loop, the program always visits all the URLs it already visited before.
Can anyone help solve these problems?
Here is code to get all companies' details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# Iterate through all pages and get all companies URLs
for i in range(2, lastPageNum + 1):
    driver.get(baseUrl + "/" + str(i))
    companyUrls.extend(driver.execute_script(js))

# Open each company page and get all details
companies = []
for url in companyUrls:
    driver.get(url)
    company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
    name = company.find_element_by_css_selector("h2").text
    email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
    website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
    phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
    fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
    country = company.find_element_by_xpath(".//li[@class='location']/span[last()]").text.replace(",", "").strip()
    address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[@class='location']/span[position() != last()]")])
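The loop above gathers name, email, website, phone, fax, country and address but never adds them to the companies list it creates. A possible completion (a sketch only, using pandas as the question does) would be to append a dict per company inside the loop and build a DataFrame at the end:

    # Sketch only: store each company's details (inside the "for url" loop above).
    companies.append({
        'name': name,
        'email': email,
        'website': website,
        'phone': phone,
        'fax': fax,
        'country': country,
        'address': address,
    })

# After the loop (requires: import pandas as pd)
df = pd.DataFrame(companies)
df.to_csv('companies.csv', index=False)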
I have a script that opens a URL, finds an HTML table, and stores all the values of that table in a CSV.
For a second site I wish to copy information from, the same script is not working. It fails to produce values for column one, but still produces the correct values for columns 2 and 3.
I believe this is because on site one the first column looks like:
4216278
whereas on the second site the first column looks like:
 4268023
I believe the leading white space before the '4268023' may be the problem?
In the code below col[0] comes back empty, while col[1] and col[2] return the correct values.
html = driver.page_source
soup = BeautifulSoup(html)
table = soup.find("table", border=1)
header = (['Claim_Number', 'Township_Area', 'Date Cancelled_Forfeited'])
records = []
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    Claim_Number = col[0].string
    Township_Area = col[1].string
    Date_Cancelled_Forfeited = col[2].string
    Claim_Cancel = (Claim_Number, Township_Area, Date_Cancelled_Forfeited)
    records.append(Claim_Cancel)
working script
import selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import csv
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# go to the MCI Main Menu KENORA Mining Division page
driver.get("http://www.mci.mndm.gov.on.ca/Claims/Cf_Claims/clm_css.cfm?Div=10")
# find first text box
inputElement = driver.find_element_by_name("Claim_View__Claim_Number")
# type in the text box, nothing (could be the claim number for a specific search)
inputElement.send_keys("")
# submit the form (click on the search button)
inputElement.submit()
#Read the HTML table with Client info, Store the info, and print to a file.
html = driver.page_source
soup = BeautifulSoup(html)
table = soup.find("table", border=1)
header = (['Claim_Number', 'Township_Area', 'Recorded_Holder', 'Due_Date'])
records = []
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    Claim_Number = col[0].string
    Township_Area = col[1].string
    Recorded_Holder = col[2].string
    Due_Date = col[3].string
    Claim_Att = (Claim_Number, Township_Area, Recorded_Holder, Due_Date)
    records.append(Claim_Att)
with open('MCItest2.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(row for row in records if row)
The script below runs to completion, but does not produce a complete CSV; column 1 is left blank.
import csv
import selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# go to the MCI Main Menu KENORA Mining Division page
driver.get("http://www.mci.mndm.gov.on.ca/Claims/Cf_Claims/clm_cas.cfm?Div=10")
# find first text box
inputElement = driver.find_element_by_name("Claim_Cancellation__Canc_Period")
inputbutton = driver.find_element_by_xpath("//input[@name='Claim_Cancellation__Canc_Period' and @value='90']").click()
# type in the text box, nothing (could be the claim number for a specific search)
inputElement.send_keys("")
# submit the form (click on the search button)
inputElement.submit()
#Read the HTML table with Client info, Store the info, and print to a file.
html = driver.page_source
soup = BeautifulSoup(html)
table = soup.find("table", border=1)
header = (['Claim_Number', 'Township_Area', 'Date Cancelled_Forfeited'])
records = []
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    Claim_Number = col[0].string
    Township_Area = col[1].string
    Date_Cancelled_Forfeited = col[2].string
    Claim_Cancel = (Claim_Number, Township_Area, Date_Cancelled_Forfeited)
    records.append(Claim_Cancel)
with open('MCICancel2.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(row for row in records if row)
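One possible fix, as a sketch rather than a tested answer: in BeautifulSoup, .string returns None when a cell holds anything more than a single bare text node, which is likely what the extra whitespace in the first column causes here. get_text(strip=True) pulls the text regardless and trims it, so the row loop could be written as:

# Sketch only: use get_text(strip=True) instead of .string so leading
# whitespace or extra child nodes in the first cell don't yield None.
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    Claim_Number = col[0].get_text(strip=True)
    Township_Area = col[1].get_text(strip=True)
    Date_Cancelled_Forfeited = col[2].get_text(strip=True)
    records.append((Claim_Number, Township_Area, Date_Cancelled_Forfeited))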