I'm using the script below to retrieve property data for a college project. It runs without errors, but the dataframe has repeated values: if I ask it to fetch 5 pages, it repeats the same data from page 1 five times. Please help!
import requests, re, time, os, csv
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
# Initialize the lists that will store the information
link_imovel = []   # here we will store the URL
address = []       # here we will store the address
neighbor = []      # here we will store the neighbourhood
anunciante = []    # here we will store the advertiser
area = []          # here we will store the area
tipo = []          # here we will store the property type
room = []          # here we will store the number of bedrooms
bath = []          # here we will store the number of bathrooms
park = []          # here we will store the number of parking spaces
price = []         # here we will store the property price

# Ask how many pages you want to collect
pages_number = int(input('How many pages? '))

# Start the execution timer
tic = time.time()

# Configure chromedriver
# To run this you need to download chromedriver and keep it in the working directory, or change the path
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
time.sleep(15)

# Loop over the pages of the site
for page in range(1, pages_number+1):
    link = 'https://www.vivareal.com.br/venda/minas-gerais/pocos-de-caldas/casa_residencial/?pagina='+str(page)+''
    driver.get(link)
    # We use a sleep time so we do not overload the site
    # Collect the whole page and turn it into a readable format
    data = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup_complete_source = BeautifulSoup(data.encode('utf-8'), "lxml")
    # Identify the container holding all of the property cards
    soup = soup_complete_source.find(class_='results-list js-results-list')
    # Web scraping
    # For each card in the result set, collect:
    for line in soup.findAll(class_="js-card-selector"):
        # Collect the full address and the neighbourhood
        try:
            full_address = line.find(class_="property-card__address").text.strip()
            address.append(full_address.replace('\n', ''))  # Get the full address
            if full_address[:3]=='Rua' or full_address[:7]=='Avenida' or full_address[:8]=='Travessa' or full_address[:7]=='Alameda':
                neighbor_first = full_address.strip().find('-')
                neighbor_second = full_address.strip().find(',', neighbor_first)
                if neighbor_second != -1:
                    neighbor_text = full_address.strip()[neighbor_first+2:neighbor_second]
                    neighbor.append(neighbor_text)  # Store the neighbourhood
                else:  # Neighbourhood not found
                    neighbor_text = '-'
                    neighbor.append(neighbor_text)  # Placeholder when the neighbourhood is not found
            else:
                get_comma = full_address.find(',')
                if get_comma != -1:
                    neighbor_text = full_address[:get_comma]
                    neighbor.append(neighbor_text)  # Store neighbourhoods with formatting issues coming from the website itself
                else:
                    get_hif = full_address.find('-')
                    neighbor_text = full_address[:get_hif]
                    neighbor.append(neighbor_text)
            # Collect the link
            full_link = line.find(class_='property-card__main-info').a.get('href')
            link_imovel.append(full_link)
            # Collect the advertiser
            full_anunciante = line.find(class_='property-card__account-link js-property-card-account-link').img.get('alt').title()
            anunciante.append(full_anunciante)
            # Collect the area
            full_area = line.find(class_="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area").text.strip()
            area.append(full_area)
            # Collect the property type
            full_tipo = line.find(class_='property-card__title js-cardLink js-card-title').text.split()[0]
            full_tipo = full_tipo.replace(' ', '')
            full_tipo = full_tipo.replace('\n', '')
            tipo.append(full_tipo)
            # Collect the number of bedrooms
            full_room = line.find(class_="property-card__detail-item property-card__detail-room js-property-detail-rooms").text.strip()
            full_room = full_room.replace(' ', '')
            full_room = full_room.replace('\n', '')
            full_room = full_room.replace('Quartos', '')
            full_room = full_room.replace('Quarto', '')
            room.append(full_room)  # Get the number of bedrooms
            # Collect the number of bathrooms
            full_bath = line.find(class_="property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom").text.strip()
            full_bath = full_bath.replace(' ', '')
            full_bath = full_bath.replace('\n', '')
            full_bath = full_bath.replace('Banheiros', '')
            full_bath = full_bath.replace('Banheiro', '')
            bath.append(full_bath)  # Get the number of bathrooms
            # Collect the number of parking spaces
            full_park = line.find(class_="property-card__detail-item property-card__detail-garage js-property-detail-garages").text.strip()
            full_park = full_park.replace(' ', '')
            full_park = full_park.replace('\n', '')
            full_park = full_park.replace('Vagas', '')
            full_park = full_park.replace('Vaga', '')
            park.append(full_park)  # Get the number of parking spaces
            # Collect the price
            full_price = re.sub('[^0-9]', '', line.find(class_="property-card__price js-property-card-prices js-property-card__price-small").text.strip())
            price.append(full_price)  # Get the price
        except:
            continue

# Close the chromedriver
driver.quit()

# Build a pandas dataframe and save it as a CSV file
for i in range(0, len(neighbor)):
    combinacao = [link_imovel[i], address[i], neighbor[i], anunciante[i], area[i], tipo[i], room[i], bath[i], park[i], price[i]]
    df = pd.DataFrame(combinacao)
    with open('VivaRealData.csv', 'a', encoding='utf-16', newline='') as f:
        df.transpose().to_csv(f, encoding='iso-8859-1', header=False)

# Execution time
toc = time.time()
get_time = round(toc-tic, 3)
print('Finished in ' + str(get_time) + ' seconds')
print(str(len(price))+' results!')
It seems to me that the for line in soup.findAll loop never moves on to the next page; I've tried everything, but I always get the data from the first page.
Indeed, the URL does return the same results regardless of the page number requested. It also returns the same information if requests is used, which avoids the huge overhead of Selenium.
A better (and much faster) approach is to access all of the data directly from the site's JSON API.
The following shows you a possible starting point. All of the data is inside data; you just need to find the information you want and access it. I suggest you print(data) and use a tool to format it so it is easier to read.
import requests, re, time, os, csv
# Ask how many pages you want to collect
#pages_number = int(input('How many pages? '))
pages_number = 5
# Start the execution timer
tic = time.time()
sess = requests.Session()
params = {
'addressCity' : 'Poços de Caldas',
'addressLocationId' : 'BR>Minas Gerais>NULL>Pocos de Caldas',
'addressNeighborhood' : '',
'addressState' : 'Minas Gerais',
'addressCountry' : 'Brasil',
'addressStreet' : '',
'addressZone' : '',
'addressPointLat' : '-21.7854',
'addressPointLon' : '-46.561934',
'business' : 'SALE',
'facets' : 'amenities',
'unitTypes' : 'HOME',
'unitSubTypes' : 'UnitSubType_NONE,SINGLE_STOREY_HOUSE,VILLAGE_HOUSE,KITNET',
'unitTypesV3' : 'HOME',
'usageTypes' : 'RESIDENTIAL',
'listingType' : 'USED',
'parentId' : 'null',
'categoryPage' : 'RESULT',
'includeFields' : 'search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount),page,seasonalCampaigns,fullUriFragments,nearby(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),expansion(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones,phones),developments(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),owners(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount))',
'size' : '100',
'from' : '144',
'q' : '',
'developmentsSize' : '5',
'__vt' : '',
'levels' : 'CITY,UNIT_TYPE',
'ref' : '/venda/minas-gerais/pocos-de-caldas/casa_residencial/',
'pointRadius' : '',
'isPOIQuery' : '',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'x-domain': 'www.vivareal.com.br',
}
results = 0
with open('VivaRealData.csv', 'w', newline='', encoding='utf-16') as f_output:
    csv_output = csv.writer(f_output)
    # Loop over the pages of the site
    for page in range(pages_number+1):
        print(f"Page {page+1}")
        link = 'https://glue-api.vivareal.com/v2/listings'
        params['from'] = f"{page * 100}"
        req = sess.get(link, headers=headers, params=params)
        data = req.json()
        for listing in data['search']['result']['listings']:
            href = listing['link']['href']
            street = listing['listing']['address'].get('street', '').strip()
            bedrooms = listing['listing']['bedrooms'][0]
            bathrooms = listing['listing']['bathrooms'][0]
            price = listing['listing']['pricingInfos'][0]['price']
            row = [href, street, bedrooms, bathrooms, price]
            csv_output.writerow(row)
            results += 1

# Execution time
toc = time.time()
get_time = round(toc-tic, 3)
print(f'Finished in {get_time} seconds')
print(f'{results} results!')
For this example, it is hard coded to 5 pages and returns 593 results in about 6 seconds.
Using Pandas might be a bit overkill here as the data can be written a row at a time directly to your output CSV file.
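That said, if you do want a DataFrame later for analysis, it is easier to read the finished CSV back in one call. A minimal sketch, assuming the five-column row layout and the utf-16 encoding used above (the column names are only illustrative):

import pandas as pd

# Column names chosen to match the order written by csv_output.writerow() above
columns = ['href', 'street', 'bedrooms', 'bathrooms', 'price']
df = pd.read_csv('VivaRealData.csv', encoding='utf-16', names=columns)
print(df.head())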
How was this solved?
Your best friend here is your browser's network dev tools. With them you can watch the requests made to obtain the information. The normal flow is: the initial HTML page is downloaded, its JavaScript runs and requests more data to further fill the page.
The trick is to first locate where the data you want is (often returned as JSON), then determine what you need to recreate the parameters needed to make the request for it.
Approaches using Selenium allow the JavaScript to run, but most of the time this is not needed, as the JavaScript is just making requests and formatting the data for display.
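For example, once you spot the glue-api request in the network tab, you can replay it on its own and pretty-print the JSON to explore its structure before deciding which fields to keep. A minimal sketch, reusing the params and headers dictionaries from the script above:

import json
import requests

# Assumes the params and headers dictionaries defined in the script above
resp = requests.get('https://glue-api.vivareal.com/v2/listings', headers=headers, params=params)
data = resp.json()

# Pretty-print the whole response to see which fields are available
print(json.dumps(data, indent=2, ensure_ascii=False))

# For instance, list the keys of the first listing
first = data['search']['result']['listings'][0]
print(first['listing'].keys())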
I am retrieving information from web pages through web scraping. My code does not throw errors, but I am having trouble saving the information into a kind of database. I leave my code here in case anyone can help me: the CSV file gets created, but at the moment nothing at all is saved to it:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.request import urlopen
# SEARCH ENGINE
# Crawling sites through the search bar

class Content:  # We treat the articles as objects, so the Content class extracts the content from the "el ciudadano" site
    def __init__(self, topic, url, title, body, data):  # the class holds the search topic, the URL, the title, the body and the publication date
        self.topic = topic  # the search keyword
        self.title = title  # the title
        self.body = body  # the body
        self.url = url  # the URL
        self.data = data  # the publication date

    def print(self):
        print("New article found for topic: {}".format(self.topic))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("URL: {}".format(self.url))
        print("DATA: {}".format(self.data))

class Website:  # Class that stores the properties of the site
    """
    Holds information about the structure of the website
    """
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag, dataTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl  # holds the search URL
        self.resultListing = resultListing  # element that wraps every result found in the listing
        self.resultUrl = resultUrl  # tag that contains the link we want to follow
        self.absoluteUrl = absoluteUrl  # boolean flag telling whether the URL is absolute or relative:
        # absolute = True, relative = False
        self.titleTag = titleTag  # tag of the article title
        self.bodyTag = bodyTag  # tag of the article body
        self.dataTag = dataTag  # tag of the article publication date

class Crawler:  # Class that takes a URL and returns a BeautifulSoup object
    # Exceptions and possible errors are handled here
    def getPage(self, url):  # takes a URL and returns a BeautifulSoup object
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    # Utility function that finds elements inside the BeautifulSoup object
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            # return childObj[0].get_text() would return only the first match
            return '\n'.join(
                [elem.get_text() for elem in childObj])  # return all matches
        return ""  # otherwise, return an empty string

    def search(self, topic, site):  # takes a topic and a site
        """
        Searches a given website for a given topic and records all the pages found
        """
        bs = self.getPage(site.searchUrl + topic)  # request the site's search URL with the topic appended
        searchResults = bs.select(site.resultListing)  # object containing all of the results
        registrocontenido = []
        for result in searchResults:  # there are several results, so go through them one by one
            url = result.select(site.resultUrl)[0].attrs["href"]  # the href attribute contains the link
            # Check whether it is a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)  # absolute URL
            else:
                bs = self.getPage(site.url + url)  # relative URL
            if bs is None:
                print("Tenemos un problema!!")
                return
            title = self.safeGet(bs, site.titleTag)  # use the safeGet utility function
            body = self.safeGet(bs, site.bodyTag)
            data = self.safeGet(bs, site.dataTag)
            if title != '' and body != '':
                # If title and body are not empty, print them
                content = Content(topic, url, title, body, data)
                content.print()
                registrocontenido.append(content)
        return registrocontenido

def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            csvrow = [article, topic, article.title, article.data, article.body, article.url]
    finally:
        csvFile.close()

crawler = Crawler()

# siteData = [name, main url, search url, result listing tag, result title tag in the listing, absolute url, article title tag, body tag, date tag]
siteData = [['El ciudadano', 'https://www.elciudadano.com/', 'https://www.elciudadano.com/?s=', 'div.td_module_16 ', 'h3.entry-title a', True, 'h1.entry-title', 'div.td-post-content p', 'time']]

sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8]))

topics = ['PYTHON']  # topics we want to extract
articles = []
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in sites:  # loop over the sites
        articles.extend(crawler.search(topic, targetSite))
        crawler.search(topic, targetSite)  # call the search function
writeArticles('Articulos.csv', articles)
I appreciate any help or suggestions!
In your method that saves the content to CSV you are not writing to the file; the code just opens and closes the file pointer. Use writer.writerow():
def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            writer.writerow([article, topic, article.title, article.data, article.body, article.url])
    finally:
        csvFile.close()
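As a side note, a slightly tidier version of the same function uses a with block so the file is always closed, passes newline='' to avoid blank rows on Windows, writes a header row first, and takes the topic from each article object instead of the global topic variable. A sketch, assuming the Content objects shown in the question:

def writeArticles(filename, articles):
    with open(filename, 'w', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(['topic', 'title', 'date', 'body', 'url'])  # header row
        for article in articles:
            writer.writerow([article.topic, article.title, article.data, article.body, article.url])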
I know that this kind of question has been asked before, but I don't think in this specific situation. If it has been, feel free to point me to it.
I have an HTML file organized (you can view the original here) like this:
<h5 id="foo1">Title 1</h5>
<table class="foo2">
  <tbody>
    <tr>
      <td>
        <h3 class="foo3">SomeName1</h3>
        <img src="Somesource" alt="SomeName2" title="SomeTitle"><br>
        <p class="textcode">
          Some precious text here
        </p>
      </td>
      ...
</table>
I would like to extract the name, the image and the text contained in the <p> of each table cell, for each h5 separately, meaning I would like to save each of these items in a separate folder named after the corresponding h5.
I tried this:
# coding: utf-8
import os
import re
from bs4 import BeautifulSoup as bs

os.chdir("WorkingDirectory")

# Open the HTML file and parse its content into the variable of the same name
with open("TheGoodPath.htm","r") as html:
    html = bs(html,'html.parser')

# Select the headers, keep only the first six results and create the folders
h5 = html.find_all("h5",limit=6)
for h in h5:
    # Create the folders named after the headers
    chemin = u"../Résulat/"
    nom = str(h.contents[0].string)
    os.makedirs(chemin + nom, exist_ok=True)
    # Select the sibling table located right after the header
    table = h.find_next_sibling(name = 'table')
    for t in table:
        # Select the headers containing the document titles
        h3 = t.find_all("h3")
        for k in h3:
            titre = str(k.string)
            # Create the directories named after the figures
            os.makedirs(chemin + nom + titre, exist_ok=True)
            os.fdopen(titre.tex)
            # Get the image located in the sibling tag right after the previous header
            img = k.find_next_sibling("img")
            chimg = img.img['src']
            os.fdopen(img.img['title'])
            # Get the TikZ code located in the sibling tag right after the previous header
            tikz = k.find_next_sibling('p')
            # Extract the TikZ code contained in the tag we just grabbed
            code = tikz.get_text()
            # Define then write the preamble and the code needed to produce the image saved above
            preambule = r"%PREAMBULE \n \usepackage{pgfplots} \n \usepackage{tikz} \n \usepackage[european resistor, european voltage, european current]{circuitikz} \n \usetikzlibrary{arrows,shapes,positioning} \n \usetikzlibrary{decorations.markings,decorations.pathmorphing, decorations.pathreplacing} \n \usetikzlibrary{calc,patterns,shapes.geometric} \n %FIN PREAMBULE"
            with open(chemin + nom + titre,'w') as result:
                result.write(preambule + code)
But it raises AttributeError: 'NavigableString' object has no attribute 'find_next_element' on the line h3 = t.find_all("h3") (line 21).
This seems to be what you want. There only appears to be one table after each h5, so don't iterate over it; just use find_next and work with the table it returns:
from bs4 import BeautifulSoup
import requests

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)

h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        print(img["src"])
        print(img["title"])
        print(img.find_next("p").text)
        print()
Which gives you output like:
repere-plan.svg
\begin{tikzpicture}[scale=1]
\draw (0,0) --++ (1,1) --++ (3,0) --++ (-1,-1) --++ (-3,0);
\draw [thick] [->] (2,0.5) --++(0,2) node [right] {z};
%thick : gras ; very thick : très gras ; ultra thick : hyper gras
\draw (2,0.5) node [left] {O};
\draw [thick] [->] (2,0.5) --++(-1,-1) node [left] {x};
\draw [thick] [->] (2,0.5) --++(2,0) node [below] {y};
\end{tikzpicture}
Lignes de champ et équipotentielles
images/cours-licence/em3/ligne-champ-equipot.svg
ligne-champ-equipot.svg
\begin{tikzpicture}[scale=0.8]
\draw[->] (-2,0) -- (2,0);
\draw[->] (0,-2) -- (0,2);
\draw node [red] at (-2,1.25) {\scriptsize{Lignes de champ}};
\draw node [blue] at (2,-1.25) {\scriptsize{Equipotentielles}};
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sin(\x r)*3*sin(\x r)*5});
%r = angle en radian
%domain permet de définir le domaine dans lequel la fonction sera tracée
%samples=200 permet d'augmenter le nombre de points pour le tracé
%smooth améliore également la qualité de la trace
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sin(\x r)*2*sin(\x r)*5});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sqrt(abs(cos(\x r)))*15});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sqrt(abs(cos(\x r)))*15});
\end{tikzpicture}
Fonction arctangente
images/schemas/math/arctan.svg
arctan.svg
\begin{tikzpicture}[scale=0.8]
\draw[very thin,color=gray] (-pi,pi) grid (-pi,pi);
\draw[->] (-pi,0) -- (pi,0) node[right] {$x$};
\draw[->] (0,-2) -- (0,2);
\draw[color=red,domain=-pi:pi,samples=150] plot ({\x},{rad(atan(\x))} )node[right,red] {$\arctan(x)$};
\draw[color=blue,domain=-pi:pi] plot ({\x},{rad(-atan(\x))} )node[right,blue] {$-\arctan(x)$};
%Le rad() est une autre façon de dire que l'argument est en radian
\end{tikzpicture}
To write all of the .svg files to disk:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from os import path

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)

base_url = "http://www.physagreg.fr/"
h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        src, title = img["src"], img["title"]
        # join base url and image url
        img_url = urljoin(base_url, src)
        # open file using title as file name; the content is bytes, so write in binary mode
        with open(title, "wb") as f:
            # request the img url and write the content
            f.write(requests.get(img_url).content)
Which will give you arctan.svg, courbe-Epeff.svg and all the rest of the images on the page.
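If you also want the folder-per-h5 layout described in the question, the same traversal can create a directory from each h5 title and save the TikZ code of every figure inside it. A sketch under the same assumptions as above (the directory and file names are only illustrative, and the h5/h3 titles are assumed to be valid path components):

import os

for h5 in soup.find_all("h5", limit=6):
    folder = h5.get_text(strip=True)
    os.makedirs(folder, exist_ok=True)
    table = h5.find_next("table")
    for h3 in table.select("h3"):
        name = h3.get_text(strip=True)
        tikz_code = h3.find_next("p").get_text()
        # write the TikZ code of each figure into the folder named after its h5
        with open(os.path.join(folder, name + ".tex"), "w", encoding="utf-8") as f:
            f.write(tikz_code)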
It looks like (judging by the for t in table loop) you meant to find multiple "table" elements. Use find_next_siblings() instead of find_next_sibling():
table = h.find_next_siblings(name='table')
for t in table:
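For what it's worth, the original AttributeError comes from find_next_sibling() returning a single Tag: the for t in table loop then iterates over that table's children, which include NavigableString objects (the whitespace between tags), and those do not support searching the way a Tag does. find_next_siblings() returns a list of Tag elements, so each t in the loop is a whole table.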