i'm currently writing a script to crawl linkedin with a bot, but i get a few problems, first here's my code:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import time
import requests
print('- Importation des packages')
# Task 1: Login to Linkedin
driver = webdriver.Chrome(ChromeDriverManager().install())
# Task 1.1: Open Chrome and Access Linkedin login site
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)
# Task 1.2: Import username and password
credential = open(r"C:\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)
# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)
# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[#id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')
search_field = driver.find_element(By.XPATH, '//*[#id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
def GetURL():
page_source = BeautifulSoup(driver.page_source, features='lxml')
profiles = driver.find_elements(By.XPATH,'//*[#id="main"]/div/div/div/ul/li/div/div/div[2]/div[1]/div[1]/div/span/span/a/span/span[1]')
all_profile_URL = []
for profile in profiles:
#profile_ID = profiles.get_attribute('href')
#profile_URL = "https://www.linkedin.com" + profile_ID
profile_URL = profile.get_attribute('href')
print(profile_URL)
if profile not in all_profile_URL:
all_profile_URL.append(profile_URL)
return all_profile_URL
##Pagination
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
URLs_one_page = GetURL()
sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') #scroll to the end of the page
sleep(3)
next_button = driver.find_element(By.CLASS_NAME, "artdeco-button__text")
driver.execute_script("arguments[0].click();", next_button)
if URLs_one_page is not None:
URLs_all_page = URLs_all_page + URLs_one_page
print(URLs_all_page)
else:
print('variable stores a None value')
sleep(2)
print(URLs_all_page)
#Scrappe + stock dans un csv
with open('nouvow.csv', 'w', newline = '') as file_output:
headers = ['Nom_prénom', 'Job', 'Location', 'URL']
writer = csv.DictWriter(file_output, delimiter=',', lineterminator='\n',fieldnames=headers)
writer.writeheader()
for linkedin_URL in URLs_all_page:
driver.get_all(linkedin_URL)
print('- Accès au profile: ', linkedin_URL)
page_source = BeautifulSoup(driver.page_source, "html.parser")
info_div = page_source.find('div',{'class':'flex-1 mr5'})
info_loc = info_div.find_all('ul')
name = info_loc[0].find('li').get_text().strip() #supp les charactères inutiles
print('--- Nom: ', name)
location = info_loc[1].find('li').get_text().strip()
print('--- Localisation :', location)
title = info_div.find('h2').get_text().strip()
print('--- Job: ', title)
writer.writerow({headers[0]:name, headers[1]:location, headers[2]:title, headers[3]:linkedin_URL})
print('\n')
print('Ok final')
Here's my output:
Initialisation du chrome driver
Importation des id
Email ok
Mdp ok
- Task A: Connexion à LinkedinQuels type de profil souhaitez vous scraper?? fullstack blockchain
TASK B OK
Nombre de pages à scraper: 4
[]
None
None
None
None
None
None
None
None
None
None
[None, None, None, None, None, None, None, None, None, None]
So i think that i couldnt localize the profiles and get their links because i have a problem with my xpaths expressions:
[#id="main"]/div/div/div/ul/li/div/div/div[2]/div[1]/div[1]/div/span/span/a/span/span[1]
Plus, it doesnt click on the next button for pagination, and finally i get an error at this line of code:
driver.get_all(linkedin_URL)
I would really appreciate some help with that, thank you
Related
anyone can help with scraping from https://www.whed.net/home.php
the code I'm using is giving me empty df. would love to have universities with websites and maybe field of study. My scraping skills are weak so if you can guide me through this would be great thanks guys.
begin=time.time()
countries=['Emirates','United States of America (all)']
result = [] # List to store all data
univ_links=[] # Links for all universities
fields = ['Street:','City:','Province:','Post Code:','WWW:','Fields of study:','Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe') # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)
#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[#id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup')
grps=webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup/option[1]')
for c in cntry_el:countries.append(c.text)
for g in grps: countries.append(g.text)
for cntry in countries:
select = Select(webD.find_element_by_id('Chp1'))#select country dropdown
select.select_by_visible_text(cntry)#choosing country
Btn_GO = webD.find_element_by_xpath('//*[#id="fsearch"]/p/input')
Btn_GO.click()
select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))#select results per page drop down
select_rpp.select_by_visible_text('100')#choosing 100 results per page option
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li') # list of university elements
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
while True:
try:
webD.find_element_by_partial_link_text('Next').click()
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li')
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
except NoSuchElementException: break
for l in univ_links:
webD.get(l)
webD.implicitly_wait(2)
title=webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[1]').text
title_detailed = webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[2]').text
cntry_name=webD.find_element_by_xpath('//*[#id="contenu"]/p[2]').text
t1=webD.find_elements_by_class_name('dt')
t2=webD.find_elements_by_class_name('dd')
labels=webD.find_elements_by_class_name('libelle')
content=webD.find_elements_by_class_name('contenu')
temp={}
fos=''
fos1=''
temp.update({'Title': title,'Detailed Title':title_detailed,'Country':cntry_name})
for i in range(len(t1)):
if t1[i].text == '' or t1[i].text == 'Address':
continue
else:
value=t2[i].text
temp.update({t1[i].text:value.replace('\n',',')})
for j in range(len(content)):
if labels[j].text in fields:
if labels[j].text == 'Fields of study:':
info=content[j].text
fos=fos+','+info
elif labels[j].text == 'Job title:':
info1=content[j].text
fos1=fos1+','+info1
else:
key=labels[j].text
temp.update({key[:-1]: content[j].text})
temp.update({'Fields of study': fos.lstrip(','),'Job titles':fos1.lstrip(',')})
result.append(temp)
data=pd.DataFrame(result)
data
end=time.time()
print("Time taken : "+ str(end-begin) +"s")
data.to_csv("WHED1.csv",index=False)
this code what i could use taken from github project.
would be great if i can re-create the data and save it, want this to be used as a dropdown in a web application just to make sure no mistakes written in the university studied in.
Update 1/12/22 - Async
Found a much better solution using aiohttp, it also runs the entire list of countries in ~30 seconds instead of 3 hours
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
print("Init")
driver = init_driver()
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Gathering Countries")
countries = get_countries(driver)
driver.quit()
print("Scraping")
start = time.time()
institution_list = asyncio.run(fetch_all(countries))
print("Writing out")
f = open('output.json', 'w')
f.write(json.dumps(institution_list))
f.close()
end = time.time()
print(f"Total time: {end - start}s")
def init_driver():
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
return driver
def get_countries(driver):
select = Select(driver.find_element(By.ID, "Chp1"))
countries = list(map(lambda c: c.get_attribute('value'), select.options))
countries.pop(0)
return countries
def extract_institutions(html, country):
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
print(str(page))
number_of_institutions = str(page).split()[0]
if number_of_institutions == 'No':
print(f"No results for {country}")
return []
results = []
inst_index = 0
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
inst_index += 1
return {
'country': country,
'count': number_of_institutions,
'records': results
}
async def get_institutions(country, session):
try:
async with session.post(
url='https://www.whed.net/results_institutions.php',
data={"Chp1": country, "nbr_ref_pge": 10000}
) as response:
html = await response.read()
print(f"Successfully got {country}")
return extract_institutions(html, country)
except Exception as e:
print(f"Unable to get {country} due to {e.__class__}.")
async def fetch_all(countries):
async with aiohttp.ClientSession() as session:
return await asyncio.gather(*[get_institutions(country, session) for country in countries])
# Main call
main()
Old answer using synchronous algorithm
Improving on #Mithun's answer since it doesn't really work as it'll be stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)
print("Searching")
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
print(f'{len(results)}/{number_of_pages}')
if counter >= int(number_of_pages):
break
counter += 10
driver.find_element(By.LINK_TEXT, "Next page").click()
time.sleep(0.5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
You can use Selenium to scrape data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape for other countries as well using Loop or entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and its field of study.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
raw = soup.find_all('div', {'class': 'details'})
for i in raw:
i = (str(i.text).lstrip())
i = i.replace("\n","")
i = i.replace("\r", "")
i = i.replace("\t", "")
print(i)
next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
counter += 10
driver.quit()
I was doing some crawling stuff with selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
instagram_id="username"
instagram_pw="password"
_id = driver.find_element(By.NAME, 'username')
_id.send_keys(instagram_id)
time.sleep(2)
_password = driver.find_element(By.NAME, 'password')
_password.send_keys(instagram_pw)
time.sleep(2)
login_button = driver.find_element(By.CSS_SELECTOR, '.sqdOP.L3NKy.y3zKF').click()
time.sleep(5) #press login button
_keyword = '교토'
driver.get('https://www.instagram.com/explore/tags/' + _keyword + '/') #instagram serch
driver.find_element(By.CSS_SELECTOR, 'div.v1Nh3.kIKUG._bz0w').click()
time.sleep(5) #open first post
There was no problem so far
But in here, NoSuchElementException Error occurs.
results = []
count = 200
for i in range(count):
data = driver.find_elements(By.CSS_SELECTOR, 'a.xil3i') #save hashtag info
for j in range(len(data)):
results.append(data[j].text.replace("#","")) #remove'#'
if (i+1)%10 == 0:
print('{}번째 게시물 완료'.format(i+1))
driver.find_element(By.CSS_SELECTOR, 'a._65Bje.coreSpriteRightPaginationArrow').click() #다음 게시물로 이동
time.sleep(5)
help me fix that error plz.
Thanks
I am trying to scrape the to paginate and scrape each table details from this site.
https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx
Screenshot
I need to click each details box, get directed to a new window and do it for the other records in each page. Then paginate. Here is my selenium code
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
PATH = 'chromedriver.exe'
options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.maximize_window()
driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')
driver.find_element_by_xpath('//*[#id="Div1"]/input').click()
def wait(locator, id):
element = WebDriverWait(driver, 50).until(
EC.presence_of_all_elements_located((locator, id))
)
return element
DATA = []
name = '//*[#id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[#id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[#id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[#id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[#id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[#id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[#id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[#id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[#id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'
details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']
def gotopage(page):
for p in range(page-1):
next_page = driver.find_element_by_class_name('dxWeb_pNext_Material')
action = ActionChains(driver)
action.click(next_page)
action.perform()
time.sleep(4)
def each_page(page, new):
global DATA
curr = 0
while curr < 80:
if page > 1 and new:
gotopage(page)
action = ActionChains(driver)
action.move_to_element(driver.find_element_by_xpath('//*[#id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]')).click()
action.perform()
action.send_keys(Keys.ARROW_UP, Keys.RETURN)
action.perform()
time.sleep(17)
data = {}
action = ActionChains(driver)
detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
try:
action.click(detail_list[curr])
action.perform()
except IndexError:
print(curr)
driver.back()
gotopage(page)
data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')
for i, d in enumerate([postal, fax, province, email, address, phone, courtroom, webpage]):
info = driver.find_element_by_xpath(d).get_attribute(('value'))
data[details[i]] = info
DATA.append(data)
curr += 1
driver.back()
print('============SCRAPING===============')
page = 1
new=True
while page <= 50:
try:
each_page(page, new)
page += 1
except Exception as err:
print(err)
print(page)
The problem here is that this is incredibly slow because each time you say
driver.back()
It goes back to page 1 and I would need to go back to the current page it would need to go back to the page it was in.
Is there anyway i can achieve this with something like BeautifulSoup?
for scraping reactions for a post in a page facebook, i can scrape all the informations (comments, reactions, tags,...) but when i want to put them in a dataframe, a have an error (arrays must all be same length) which is normal because sometime there's someone who put only a comment and an another one only a tag, so i have lists with differents length. i think i can put a conditionnal if but may there another optimized solution...
for example len(tag) =2, len(usr) = 17, len(commentaire)=12.
thanks :)
#imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from bs4 import BeautifulSoup
import time
from time import sleep
from lxml import html
import logging as log
import pandas as pd
#chemin de chrome et desactivation des adds automatique de FB anti scrape
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe',
chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
sleep(1)
#reperage de user et pass (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##reperage de user et pass et click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(#id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(#id,'pass')]")
usr=input('Enter Email Id:')
pwd=input('Enter Password:')
#enter données
username.clear()
username.send_keys(usr)
print ("Email Id entered")
sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#reperage bouton log in et click
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post)
page = requests.get(post)
df_comm = pd.DataFrame(columns = ['post_url', 'user', 'commentaire', 'tag', 'user_url'])
page_count = 0
while (True ):
#scrape les reactions
tree = html.fromstring(driver.page_source)
user = tree.xpath("//div[#class='eb']/div/h3/a/text()")
commentaire = tree.xpath("//div[#class='eb']/div/div[1]/text()")
tag = tree.xpath("//div[#class='eb']/div/div[1]/a/text()")
user_url = tree.xpath("//div[#class='eb']/div/h3/a/#href")
data= {'post_url':[post]*len(user), 'user':user, 'commentaire':commentaire, 'tag':tag,
'user_url':user_url}
df_comm = df_comm.append(pd.DataFrame(columns = df_comm.columns,data=data))
#Check if more reaction exist ("En afficher davantage" existe ou pas)
next_link = tree.xpath("//div[#class='eb eu']/a/#href")
if len(next_link)!= 0:
driver.find_element_by_xpath("//div[#class='eb eu']/a/#href").click()
page_count = page_count+1
else :
next_link = ''
break
df_comm =df_comm.reset_index()
#df_comm.to_csv(path,index=False)
driver.close()
You should do it in different way.
First you should find all comments - elements with text, user, tag, etc. - and next you should use for-loop to work with every comment separatelly. If loop you should use relavite xpath (starting at .) to get only information for this single comment. And then you can see if you have missing tag or other item and you can put some default value - i.e. empty string.
This way every comment will have all values so every row in CSV will have the same size.
This way also resolve other problem - in previous method you could get first comment with tag from second comment and you couldn't control it.
To make code simpler I put every comment on list of rows and later I convert all to DataFrame.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import time
from lxml import html
import logging as log
import pandas as pd
#chemin de chrome et desactivation des adds automatique de FB anti scrape
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
#driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe', chrome_options=chrome_options)
driver = webdriver.Chrome(chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
time.sleep(1)
#reperage de user et pass (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##reperage de user et pass et click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(#id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(#id,'pass')]")
usr = input('Enter Email Id:')
pwd = input('Enter Password:')
#enter données
username.clear()
username.send_keys(usr)
print ("Email Id entered")
#time.sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#reperage bouton log in et click
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post_url = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post_url)
all_rows = []
page_count = 0
while True:
#scrape les reactions
page_count += 1
print('\n--- page:', page_count, '---\n')
tree = html.fromstring(driver.page_source)
# find all comments
all_comments = tree.xpath("//div[#class='ec']/div")
print('len(all_comments):', len(all_comments))
# work with every comment separatelly
for comment in all_comments:
user = comment.xpath(".//h3/a/text()") # relative xpath starting at `.`
print('user:', user)
user = user[0] if user else "" # set default value
print('user:', user)
commentaire = comment.xpath(".//div[1]/text()") # relative xpath starting at `.`
print('commentaire:', commentaire)
commentaire = commentaire[0] if commentaire else "" # set default value
print('commentaire:', commentaire)
tag = comment.xpath(".//div[1]/a/text()") # relative xpath starting at `.`
print('tag:', tag)
tag = tag[0] if tag else "" # set default value
print('tag:', tag)
user_url = comment.xpath(".//h3/a/#href") # relative xpath starting at `.`
print('user_url:', user_url)
user_url = user_url[0] if user_url else "" # set default value
print('user_url:', user_url)
all_rows.append([post_url, user, commentaire, tag, user_url])
#Check if more reaction exist ("En afficher davantage" existe ou pas)
next_link = driver.find_elements_by_xpath("//div[#class='ec es']/a")
print('---')
print('len(next_link):', len(next_link))
if next_link:
next_link[0].click()
time.sleep(2)
else:
break
# - after loop -
df = pd.DataFrame(all_rows, columns=['post_url', 'user', 'commentaire', 'tag', 'user_url'])
print(df)
df.to_csv('output.csv', index=False)
#driver.close()
scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need scrape by selenium. it needs 3 steps,
1. get the company url from website.
2. get all company url from next page/ all pages.
3. scrape all contact information such as company name, website, email. etc.
the code as below, but I face two problem.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for i in range(1, MAX_PAGE_NUM):
page_num = str(i)
url ="http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
driver.get(url)
sleep(5)
sel = Selector(text=driver.page_source)
companies = sel.xpath('//*[#id="categorypagehtml"]/div[1]/div[7]/ul/li/b//#href').extract()
for i in range(0, len(companies)):
print(companies[i])
results.append(companies[i])
print('---')
for result in results:
url1 = "http://www.arabianbusinesscommunity.com" +result
print(url1)
driver.get(url1)
sleep(5)
sel = Selector(text=driver.page_source)
name = sel.css('h2::text').extract_first()
country = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
if country:
country = country.strip()
web = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/#href').extract_first()
email = sel.xpath('//a[contains(#href, "mailto:")]/#href').extract_first()
records = []
records.append((web,email,country,name))
df = pd.DataFrame(records, columns=['web','email', 'country', 'name'])
I write the code as above, but I have two problem.
1. I only can get the last company information.
2.each time it is iteration from the loop, computer always click all urls that clicked before.
can anyone help solve the problem?
Here code to get all companies details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# Iterate through all pages and get all companies URLs
for i in range(2, lastPageNum):
driver.get(baseUrl + "/" + str(i))
companyUrls.extend(driver.execute_script(js))
# Open each company page and get all details
companies = []
for url in companyUrls:
driver.get(url)
company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
name = company.find_element_by_css_selector("h2").text
email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
country = company.find_element_by_xpath(".//li[#class='location']/span[last()]").text.replace(",", "").strip()
address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[#class='location']/span[position() != last()]")])