I have this page (https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp) from which I want to extract data.
You can look up a person's data just by entering a number in the "Matricula" field; that part is easy. But when the results page is generated and I try to get the data from a specific div, it gives me None, and when I inspect the HTML it uses to display the data, it is the same as the page where I enter the numbers.
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def clear(): return os.system("cls")
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
driver_path = 'C:\\Users\\Menem Lo Hizo\\Downloads\\chromedriver_win32\\chromedriver.exe'
driver = webdriver.Chrome(driver_path, chrome_options=options)
driver.get('https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp')
matricula = driver.find_element_by_id("matricula")
matricula.send_keys("2")
matricula.send_keys(Keys.RETURN)
try:
    div = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "col-md-8 col-md-offset-2"))
    )
except:
    driver.quit()

clear()
print(div)
This is my code.
Logging one's network traffic when submitting the form reveals an HTTP POST request being made to productoresactivos.asp, the response of which is HTML. Simply imitate that request:
def get_columns():
    import requests
    from bs4 import BeautifulSoup as Soup

    url = "https://www.ssn.gob.ar/storage/registros/productores/productoresactivos.asp"

    payload = {
        "socpro": "PAS",
        "matricula": "2",
        "apellidorazonsocial": "",
        "docNro": "",
        "Submit": "Buscar"
    }

    response = requests.post(url, data=payload)
    response.raise_for_status()

    soup = Soup(response.content, "html.parser")

    for column in soup.select("div[class^=\"col-md-\"]"):
        yield " ".join(column.get_text().strip().split())


def main():
    for text in get_columns():
        print(text)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
Página 1 de 1
Matrícula: 2
Nombre: CABELLO DE GADANO, MARIA CRISTINA
Documento: DNI - 5263977
CUIT: 27-05263977-3
Ramo: PATRIMONIALES Y VIDA
Domicilio: AV. CORDOBA 669 12º B
Localidad: CIUDAD AUTONOMA BS.AS.
Provincia CIUDAD AUTONOMA
Cod. Postal: 1054
Teléfonos: 4311-5860
E-mail:
Nro. de Resolución 17053
Fº de Resolución 06/01/1983
Nro. de Libro: 01
Nro. de Rubrica: 20395
Fº. de Rubrica: 21/08/1992
Nro. de Libro: 1
Fº. de Rubrica: 20396
Fº. de Rubrica: 21/08/1992
A few things:
You need explicit waits.
When you hit Enter on the first page, a new tab opens, so you need to switch to that window.
Code:
driver.get("https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp")
wait = WebDriverWait(driver, 10)
org_handles = driver.window_handles
wait.until(EC.element_to_be_clickable((By.ID, "matricula"))).send_keys("2" + Keys.RETURN)
new_handles = driver.window_handles
driver.switch_to.window(new_handles[1])
div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".col-md-8.col-md-offset-2")))
print(div.text)
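As a follow-up, if the new tab is slow to open, driver.window_handles may still hold only the original window when it is read. A minimal sketch of a more defensive variant (same imports and wait object as above), using Selenium's number_of_windows_to_be condition:

org_handles = driver.window_handles
wait.until(EC.element_to_be_clickable((By.ID, "matricula"))).send_keys("2" + Keys.RETURN)

# wait until the second tab actually exists before switching to it
wait.until(EC.number_of_windows_to_be(2))
new_handle = [h for h in driver.window_handles if h not in org_handles][0]
driver.switch_to.window(new_handle)

div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".col-md-8.col-md-offset-2")))
print(div.text)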
I'm writing a Python program to scrape this site using Selenium and BeautifulSoup:
https://www.argentina.gob.ar/desarrollosocial/registrocuidadores
I was able to go through the steps to access the first table I need (that's what the code does), but then the webdriver closes itself and I get this error in the console:
Traceback (most recent call last):
File "/Users/martin/Desktop/Scrap/scrapy1-3.py", line 33, in
select2.select_by_visible_text(option2.text)
^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webelement.py", line 89, in text
return self._execute(Command.GET_ELEMENT_TEXT)["value"]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webelement.py", line 410, in _execute
return self._parent.execute(command, params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 444, in execute
self.error_handler.check_response(response)
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py", line 249, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=109.0.5414.87)
This is my code right now:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome(
    '/Users/martin/Downloads/chromedriver_mac64/chromedriver')

# Open the website
driver.get("https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx")

# Wait for the page to load before scraping
time.sleep(3)

boton1 = driver.find_element(
    By.XPATH, "//*[@id='ContentPlaceHolder1_DropDownListProv']")
select1 = Select(boton1)
options1 = select1.options

for option1 in options1:
    select1.select_by_visible_text(option1.text)
    time.sleep(3)  # wait for the page to load

    boton2 = driver.find_element(
        By.XPATH, "//*[@id='ContentPlaceHolder1_DropDownListLoc']")
    select2 = Select(boton2)
    options2 = select2.options

    for i in range(1, len(options2)):
        option2 = options2[i]
        select2.select_by_visible_text(option2.text)
        time.sleep(3)  # wait for the page to load

        boton3 = driver.find_element(By.ID, "ContentPlaceHolder1_ButtonBuscar")
        boton3.click()
        time.sleep(3)

        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.presence_of_element_located(
            (By.ID, "ContentPlaceHolder1_GridView1")))

        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", class_="gridview")

        if table:
            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                for cell in cells:
                    print(cell.text)
        else:
            print("La tabla no ha sido encontrada")
To use only bs4 without selenium you can try:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
# first page:
url = 'https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
data = {}
for inp in soup.select('input[value]'):
    data[inp['name']] = inp['value']

soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
df = pd.read_html(str(soup))[0]
print(df[:-1])

# for additional pages:
for page in range(2, 4):
    data = {}
    for inp in soup.select('input[value]'):
        data[inp['name']] = inp['value']

    del data['ctl00$ContentPlaceHolder1$ButtonBuscar']
    data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$GridView1'
    data['__EVENTARGUMENT'] = f"Page${page}"

    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    df = pd.read_html(str(soup))[0]
    print(df[:-1])
Prints:
Nombre Apellido Provincia Localidad Telefono Email Capacitaciones
0 BLANCA BEATRIZ AGUILAR Buenos Aires 25 de Mayo 0234515532692 lucianamaraportilla@gmail.com Ver
1 RUBEN OSVALDO CABALLERO Buenos Aires 25 de Mayo 0234515400320 lucianamaraportilla@gmail.com Ver
2 DAVID ALEJANDRO GIGLIO Buenos Aires 25 de Mayo 0234515517152 lucianamaraportilla@gmail.com Ver
3 LILIANA RAQUEL MACHAROLI Buenos Aires 25 de Mayo 0234515438703 lucianamaraportilla@gmail.com Ver
4 PATRICIA ELIZABETH MATTIA Buenos Aires 25 de Mayo 0234515433654 lucianamaraportilla@gmail.com Ver
5 ANDREA SILVINA PEREZ Buenos Aires 25 de Mayo 0234515513612 lucianamaraportilla@gmail.com Ver
6 NATALIA CLARISA LOPEZ Buenos Aires 25 de Mayo 0234515400562 lucianamaraportilla@gmail.com Ver
7 LUCIANA KARINA MARA Buenos Aires 25 de Mayo 0234515668788 lucianamaraportilla@gmail.com Ver
...and so on.
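If the goal is a single combined table rather than per-page prints, the per-page frames can be collected and concatenated. A minimal sketch under the same assumptions as above (same URL, same postback fields, and a hypothetical page range of 2-3):

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

frames = []

# first page: resubmit the form with the hidden ASP.NET fields
data = {inp['name']: inp['value'] for inp in soup.select('input[value]')}
soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
frames.append(pd.read_html(str(soup))[0][:-1])  # [:-1] drops the pager row

# additional pages via the GridView postback
for page in range(2, 4):  # hypothetical range, adjust to the real page count
    data = {inp['name']: inp['value'] for inp in soup.select('input[value]')}
    del data['ctl00$ContentPlaceHolder1$ButtonBuscar']
    data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$GridView1'
    data['__EVENTARGUMENT'] = f"Page${page}"
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    frames.append(pd.read_html(str(soup))[0][:-1])

df = pd.concat(frames, ignore_index=True)
print(df)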
Try using WebDriverWait:
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.ID, "ContentPlaceHolder1_ButtonBuscar"))).click()
Imports:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
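For instance, applied inside the question's inner loop (a sketch reusing the IDs and Select import from the question; re-locating the dropdown after each postback is an assumption on my part to sidestep the stale-element error):

wait = WebDriverWait(driver, 10)

# re-locate the localidad dropdown after each postback so the reference is fresh
boton2 = wait.until(EC.presence_of_element_located(
    (By.XPATH, "//*[@id='ContentPlaceHolder1_DropDownListLoc']")))
select2 = Select(boton2)

# only click the search button once it is actually clickable
wait.until(EC.element_to_be_clickable((By.ID, "ContentPlaceHolder1_ButtonBuscar"))).click()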
Hey guys, I have a problem with the if-else conditions. I was creating the following bot that searches for appointments and alerts me when one is available, but I can't make the if-else conditions at the final lines of the code work; the bot doesn't respect them. I've tried changing the code several times but have no idea how to resolve this problem. Thanks for the help.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import time
import re
import os
import winsound
duration = 2000 # milliseconds
freq = 900 # Hz
lets_go = True
while lets_go == True:
    browser = webdriver.Chrome()
    browser.implicitly_wait(30)
    browser.maximize_window()
    chrome_options = Options()
    chrome_options.add_experimental_option("detach", True)
    browser.get("https://icp.administracionelectronica.gob.es/icpplus/index.html")
    browser.verificationErrors = []
    cookie_kill = browser.find_element_by_id("cookie_action_close_header")
    cookie_kill.click()
    #sleep(1)
    madrid = browser.find_element_by_xpath('//*[@id="form"]/option[34]')
    madrid.click()
    #sleep(1)
    accept = browser.find_element_by_id("btnAceptar")
    accept.click()
    #sleep(1)
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    tramites_group = browser.find_element_by_xpath('/html/body/div[1]/div[2]/main/div/div/section/div[2]/form[1]/div[3]/div[1]/div[2]/div/fieldset/div[2]/select/option[3]')
    tramites_group.click()
    sleep(1)
    aceptar = browser.find_element_by_id("btnAceptar")
    aceptar.click()
    sleep(1)
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    enter_button = browser.find_element_by_id('btnEntrar')
    enter_button.click()
    sleep(1)
    passport = browser.find_element_by_id("rdbTipoDocPas")
    passport.click()
    passport_number = browser.find_element_by_id("txtIdCitado").send_keys("123456789")
    person_name = browser.find_element_by_id("txtDesCitado").send_keys("BORIS JOHNSON")
    person_birth = browser.find_element_by_id("txtAnnoCitado").send_keys("1900")
    nationality = browser.find_element_by_xpath('/html/body/div[1]/div[2]/main/div/div/section/div[2]/form/div/div/div[1]/div[5]/div/div/div/div/span/select/option[200]')
    nationality.click()
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    sleep(1)
    enviar = browser.find_element_by_id("btnEnviar")
    browser.execute_script("arguments[0].click();", enviar)
    sleep(1)
    enviar = browser.find_element_by_id("btnEnviar")
    browser.execute_script("arguments[0].click();", enviar)
    sleep(1)
    no_appointments = browser.page_source.find("En este momento no hay citas disponibles.")
    if no_appointments:
        browser.close()
        time.sleep(120)
    else:
        winsound.Beep(freq, duration)
        print("found")
        lets_go = False
        break
page_source returns a normal Python string. The find method of a string does not return a boolean True/False. It returns the starting character number if found, and -1 if not found. Thus, you want:
no_appointments = browser.page_source.find("En este momento no hay citas disponibles.")
if no_appointments >= 0:
    browser.close()
    time.sleep(120)
You might consider whether it makes more sense to write:
if "En este momento no hay citas disponibles" in browser.page_source:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2')
soup = BeautifulSoup(driver.page_source, 'lxml')
soup.find_all('div',class_='content-review')
# it always return empty list
# I want to scrap all of review contents from e.g "<div class="content-review" id="325243269"> Super Shampoo, meine Haare glänzt und sind sehr weich. 😍 </div>"
I've tried multiple ways, but it always returns an empty list.
What should I do to solve this problem?
You need to wait until the page has completely loaded:
driver.get(url)
timeout = 5
try:
    element_present = EC.presence_of_element_located((By.CLASS_NAME, 'content-review'))
    WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")

soup = BeautifulSoup(driver.page_source, 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())
Add necessary libs:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
OUTPUT:
Super Shampoo, meine Haare glänzt und sind sehr weich. 😍
Ich verwende dieses Produkt seit kurzem und ich bin begeistert, so ein pflegendes Shampoo habe ich noch nie gehabt. Er gibt meinen Haar Glanz, Geschmeidigkeit und Fülle. Ich kann es nur empfehlen.
Zufrieden
Tolles Shampoo
Sehr gut
Second option: find the request that returns the reviews and get the data from it directly:
url = "https://staticw2.yotpo.com/batch/1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR/80053469-250"
payload='methods=%5B%7B%22method%22%3A%22main_widget%22%2C%22params%22%3A%7B%22pid%22%3A%2280053469-250%22%2C%22page%22%3A2%2C%22order_metadata_fields%22%3A%7B%7D%2C%22widget_product_id%22%3A%2280053469-250%22%7D%7D%5D&app_key=1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR'
response = requests.request("POST", url, data=payload)
soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
for review in soup.find_all('div', class_='content-review'):
print(review.getText().strip())
with the same output.
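To fetch other review pages, the page number in that payload can be turned into a parameter. A minimal sketch under the same assumptions (same batch URL, pid and app_key as in the captured payload above):

import json
import requests
from bs4 import BeautifulSoup

url = "https://staticw2.yotpo.com/batch/1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR/80053469-250"

def get_reviews(page):
    # same request as above, built as form data instead of a pre-encoded string
    methods = [{
        "method": "main_widget",
        "params": {
            "pid": "80053469-250",
            "page": page,
            "order_metadata_fields": {},
            "widget_product_id": "80053469-250",
        },
    }]
    data = {
        "methods": json.dumps(methods, separators=(",", ":")),
        "app_key": "1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR",
    }
    response = requests.post(url, data=data)
    soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
    return [r.get_text().strip() for r in soup.find_all('div', class_='content-review')]

for text in get_reviews(2):
    print(text)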
The main issue here is that you need to close the 'accept cookies' popup, which is located in a shadow DOM.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
# OR: driver = webdriver.Chrome(executable_path='D:\Downloads\chromedriver\chromedriver.exe')
url = 'https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2'
driver.get(url)
webdriverWaiter = WebDriverWait(driver, 20)
webdriverWaiter.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, "body"), "class" ,"overflowHidden"))
shadow_host = driver.find_element(By.CSS_SELECTOR, '#usercentrics-root')
shadow_root = shadow_host.shadow_root
accept_cookies_button_css = "button[data-testid='uc-accept-all-button']"
# wait for accept cookies button to appear
accept_cookies_button = None
while not accept_cookies_button:
    try:
        accept_cookies_button = shadow_root.find_element(By.CSS_SELECTOR, accept_cookies_button_css)
    except NoSuchElementException:
        time.sleep(1)

# click accept cookies button
clicked = False
while not clicked:
    try:
        accept_cookies_button.click()
        clicked = True
    except ElementNotInteractableException:
        time.sleep(1)

content_review_css = ".content-review"
webdriverWaiter.until(EC.visibility_of_element_located((By.CSS_SELECTOR, content_review_css)))

reviews = driver.find_elements(By.CSS_SELECTOR, content_review_css)
for rev in reviews:
    print(rev.text)
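As an aside, the two polling loops could also be handed to WebDriverWait, which by default retries through NoSuchElementException; a minimal sketch assuming the same shadow_root, webdriverWaiter and accept_cookies_button_css from above:

# WebDriverWait retries the callable until it returns something truthy,
# ignoring NoSuchElementException while the button is not yet in the shadow DOM
accept_cookies_button = webdriverWaiter.until(
    lambda d: shadow_root.find_element(By.CSS_SELECTOR, accept_cookies_button_css)
)
webdriverWaiter.until(EC.element_to_be_clickable(accept_cookies_button)).click()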
I am trying to get the prices of routes on a bus page.
import requests
from bs4 import BeautifulSoup
import re

popup_linkz = list()
p = range(1, 2, 1)

for i in p:
    def get_headers(session):
        res = session.get("https://new.turbus.cl/turbuscl/inicio-compra")
        if res.status_code == 200:
            print("Got headers")
            return res.text
        else:
            print("Failed to get headers")

    def search(session):
        data = {
            'origenInputModal': 'Santiago',
            'destinoInputModal': 'Calama',
            'fechaRegreso': '03-04-2021',
            'fechaIda': '31-03-2021',
        }
        res = session.post(
            "https://new.turbus.cl/turbuscl/seleccion-itinerario",
            data=data)  # not sure if this is the search link
        if res.status_code == 200:
            print("Search succeeded")
            return res.text
        else:
            print("Search failed with error:", res.reason)
            print(res.text)

    def get_popup_link(html):
        soup = BeautifulSoup(html, "html.parser")
        for t in soup.find_all('div', {'class': 'ticket_price-value'}):
            precio = t.find('[class$="ticket_price-value"]').text
            #cantidad = t.select_one('[id$="lblCantidad"]').text
            #descripction = t.select_one('[id$="lblDescripcion"]').text
            print(f"{precio=} {precio=}")
            #print()
        return precio

    def main():
        with requests.Session() as s:
            get_headers(s)
            html = search(s)
            popup_links = (get_popup_link(html))
            print(popup_links)
            # popup_linkz.extend(popup_links)
            #print(popup_links)
            #print(popup_linkz)
            #download_html = get_download_html(s, popup_links)
            # print(download_html)
            #popup_linkz.extend(popup_links for i in range(0, 1, 1))

    main()

#a = popup_linkz
#print(a)
This is the link: https://new.turbus.cl/turbuscl/inicio-compra
Right now I am able to find the input boxes of the search, but I'm not sure where to run it.
I am getting this error: ValueError: too many values to unpack (expected 2), so I'm not sure where I'm failing.
Could you enlighten me so I can get this working?
I have been trying all day and took a new approach with Selenium in order to run the search...
Is what I am doing right, or was my first approach better?
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 29 21:04:05 2022

@author: christian marcos
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 29 16:20:40 2022

@author: christian marcos
"""
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from pandas.io.html import read_html
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
#select and fill the first field (origin)
driver = wd.Chrome('C:\\chromedriver.exe')
driver.maximize_window()
driver.get('https://new.turbus.cl/turbuscl/inicio-compra')
driver.implicitly_wait(20)
driver.find_element_by_xpath('//*[@id="origen"]').click();
wait = WebDriverWait(driver, 30)

#select and fill the first field
driver.implicitly_wait(10)
driver.find_element_by_xpath('//*[@id="modalOriginCity"]/div/div/div[2]/div[2]/ul/li[1]').click();
Best regards,
The post data needed is different. In this case, you need:
{
"fechaSalidaTramo": "31/03/2022",
"mnemotecnicoCiudadOrigenTramo": "stgo",
"mnemotecnicoCiudadDestinoTramo": "aric",
"horaSalidaTramo": 0,
"horaSalidaTramoMaxima": 0,
"codigoLinea": 90,
"numeroViaje": 0,
"numeroCuentaCorrienteCliente": 0,
"codigoIdaRegreso": 1,
"cantidadAsientos": 1,
"numeroRegistros": 0
}
And the URL is https://new.turbus.cl/turbuscl/recursos/vtwst76/web1.
In Python, it'll look like this:
import requests

HOST = "https://nclt.gov.in/"
LINK = "https://new.turbus.cl/turbuscl/recursos/vtwst76/web1"
DATA = '{"fechaSalidaTramo":"31/03/2022","mnemotecnicoCiudadOrigenTramo":"stgo","mnemotecnicoCiudadDestinoTramo":"aric","horaSalidaTramo":0,"horaSalidaTramoMaxima":0,"codigoLinea":90,"numeroViaje":0,"numeroCuentaCorrienteCliente":0,"codigoIdaRegreso":1,"cantidadAsientos":1,"numeroRegistros":0}'
HEADERS = {
    "Content-Type": "application/json",
}

def get_route(origin, destination):
    res = requests.post(LINK, data=DATA, headers=HEADERS)
    if res.status_code == 200:
        print("getting routes")
        return res.json()
    else:
        print(res)

def main():
    info = get_route("here", "there")
    print(info)

if __name__ == "__main__":
    main()
How I got to the answer:
Go to the site.
Open the network tab, so I can see requests.
Do a search, and find the request that matches.
Copy the request as a curl request and import it into postman.
Remove headers, and see if you get an error when you do a request. Repeat until you have only the needed headers.
Copy the needed headers and data, and test it using requests.
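Following the last step, the same request can also be written with the payload as a Python dict passed through requests' json= parameter, which serializes it and sets the Content-Type header. A sketch using the captured URL and fields from above (the concrete values such as the dates and city mnemonics are just the example values from that capture):

import requests

LINK = "https://new.turbus.cl/turbuscl/recursos/vtwst76/web1"

payload = {
    "fechaSalidaTramo": "31/03/2022",
    "mnemotecnicoCiudadOrigenTramo": "stgo",
    "mnemotecnicoCiudadDestinoTramo": "aric",
    "horaSalidaTramo": 0,
    "horaSalidaTramoMaxima": 0,
    "codigoLinea": 90,
    "numeroViaje": 0,
    "numeroCuentaCorrienteCliente": 0,
    "codigoIdaRegreso": 1,
    "cantidadAsientos": 1,
    "numeroRegistros": 0,
}

# json= sends the dict as a JSON body and adds Content-Type: application/json
res = requests.post(LINK, json=payload)
res.raise_for_status()
print(res.json())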
I'm scraping the reactions to a post on a Facebook page. I can scrape all the information (comments, reactions, tags, ...), but when I want to put them into a DataFrame I get an error (arrays must all be same length). This makes sense, because sometimes someone leaves only a comment and someone else only a tag, so I end up with lists of different lengths. I think I could add a conditional if, but maybe there is a more optimized solution...
For example, len(tag) = 2, len(usr) = 17, len(commentaire) = 12.
Thanks :)
#imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from bs4 import BeautifulSoup
import time
from time import sleep
from lxml import html
import logging as log
import pandas as pd
#Chrome driver path and disabling Facebook's automatic notification pop-ups (anti-scraping)
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe',
chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
sleep(1)
#locate the user and pass fields (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##locate the user and pass fields and click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(@id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(@id,'pass')]")
usr=input('Enter Email Id:')
pwd=input('Enter Password:')
#enter the data
username.clear()
username.send_keys(usr)
print ("Email Id entered")
sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#locate the log in button and click it
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post)
page = requests.get(post)
df_comm = pd.DataFrame(columns = ['post_url', 'user', 'commentaire', 'tag', 'user_url'])
page_count = 0
while (True):
    #scrape the reactions
    tree = html.fromstring(driver.page_source)
    user = tree.xpath("//div[@class='eb']/div/h3/a/text()")
    commentaire = tree.xpath("//div[@class='eb']/div/div[1]/text()")
    tag = tree.xpath("//div[@class='eb']/div/div[1]/a/text()")
    user_url = tree.xpath("//div[@class='eb']/div/h3/a/@href")
    data = {'post_url': [post]*len(user), 'user': user, 'commentaire': commentaire, 'tag': tag,
            'user_url': user_url}
    df_comm = df_comm.append(pd.DataFrame(columns=df_comm.columns, data=data))
    #Check whether more reactions exist (whether "En afficher davantage" is present or not)
    next_link = tree.xpath("//div[@class='eb eu']/a/@href")
    if len(next_link) != 0:
        driver.find_element_by_xpath("//div[@class='eb eu']/a/@href").click()
        page_count = page_count + 1
    else:
        next_link = ''
        break
df_comm =df_comm.reset_index()
#df_comm.to_csv(path,index=False)
driver.close()
You should do it in a different way.
First you should find all comments - the elements with the text, user, tag, etc. - and then use a for-loop to work with every comment separately. Inside the loop you should use relative XPaths (starting with .) to get only the information for that single comment. Then you can check whether the tag or any other item is missing and put in some default value - e.g. an empty string.
This way every comment will have all values, so every row in the CSV will have the same size.
This also resolves another problem: with the previous method you could get the first comment paired with the tag from the second comment, and you couldn't control it.
To make the code simpler, I put every comment on a list of rows and later convert it all to a DataFrame.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import time
from lxml import html
import logging as log
import pandas as pd
#Chrome driver path and disabling Facebook's automatic notification pop-ups (anti-scraping)
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
#driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe', chrome_options=chrome_options)
driver = webdriver.Chrome(chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
time.sleep(1)
#locate the user and pass fields (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##locate the user and pass fields and click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(@id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(@id,'pass')]")
usr = input('Enter Email Id:')
pwd = input('Enter Password:')
#enter the data
username.clear()
username.send_keys(usr)
print ("Email Id entered")
#time.sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#locate the log in button and click it
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post_url = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post_url)
all_rows = []
page_count = 0
while True:
    #scrape the reactions
    page_count += 1
    print('\n--- page:', page_count, '---\n')

    tree = html.fromstring(driver.page_source)

    # find all comments
    all_comments = tree.xpath("//div[@class='ec']/div")
    print('len(all_comments):', len(all_comments))

    # work with every comment separately
    for comment in all_comments:
        user = comment.xpath(".//h3/a/text()")  # relative xpath starting at `.`
        print('user:', user)
        user = user[0] if user else ""  # set default value
        print('user:', user)

        commentaire = comment.xpath(".//div[1]/text()")  # relative xpath starting at `.`
        print('commentaire:', commentaire)
        commentaire = commentaire[0] if commentaire else ""  # set default value
        print('commentaire:', commentaire)

        tag = comment.xpath(".//div[1]/a/text()")  # relative xpath starting at `.`
        print('tag:', tag)
        tag = tag[0] if tag else ""  # set default value
        print('tag:', tag)

        user_url = comment.xpath(".//h3/a/@href")  # relative xpath starting at `.`
        print('user_url:', user_url)
        user_url = user_url[0] if user_url else ""  # set default value
        print('user_url:', user_url)

        all_rows.append([post_url, user, commentaire, tag, user_url])

    #Check whether more reactions exist (whether "En afficher davantage" is present or not)
    next_link = driver.find_elements_by_xpath("//div[@class='ec es']/a")
    print('---')
    print('len(next_link):', len(next_link))
    if next_link:
        next_link[0].click()
        time.sleep(2)
    else:
        break
# - after loop -
df = pd.DataFrame(all_rows, columns=['post_url', 'user', 'commentaire', 'tag', 'user_url'])
print(df)
df.to_csv('output.csv', index=False)
#driver.close()