Web scraping in Python returning repeated data

I'm using the script below to retrieve property data for a college project. It runs without errors, but the dataframe has repeated values: if I ask it to fetch 5 pages, it repeats the same data from page 1 five times. Please help!
import requests, re, time, os, csv
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# Initialize the lists that will store the information
link_imovel = []  # listing URL
address = []      # address
neighbor = []     # neighborhood
anunciante = []   # advertiser
area = []         # area
tipo = []         # property type
room = []         # number of bedrooms
bath = []         # number of bathrooms
park = []         # number of parking spaces
price = []        # property price

# Ask how many pages should be collected
pages_number = int(input('How many pages? '))

# Start the execution timer
tic = time.time()

# Configure chromedriver
# To run this, download chromedriver and keep it in the working folder, or change the path
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
time.sleep(15)

# Loop over the result pages of the site
for page in range(1, pages_number+1):
    link = 'https://www.vivareal.com.br/venda/minas-gerais/pocos-de-caldas/casa_residencial/?pagina=' + str(page)
    driver.get(link)
    # A sleep time could be set here so as not to overload the site
    # Collect everything on the page and turn it into a parseable object
    data = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup_complete_source = BeautifulSoup(data.encode('utf-8'), "lxml")
    # Locate the container that holds all the property cards
    soup = soup_complete_source.find(class_='results-list js-results-list')

    # Web scraping: for each card in the result set, collect the fields below
    for line in soup.findAll(class_="js-card-selector"):
        try:
            # Collect the full address and the neighborhood
            full_address = line.find(class_="property-card__address").text.strip()
            address.append(full_address.replace('\n', ''))  # store the full address
            if full_address[:3] == 'Rua' or full_address[:7] == 'Avenida' or full_address[:8] == 'Travessa' or full_address[:7] == 'Alameda':
                neighbor_first = full_address.strip().find('-')
                neighbor_second = full_address.strip().find(',', neighbor_first)
                if neighbor_second != -1:
                    neighbor_text = full_address.strip()[neighbor_first+2:neighbor_second]
                    neighbor.append(neighbor_text)  # store the neighborhood
                else:  # neighborhood not found
                    neighbor_text = '-'
                    neighbor.append(neighbor_text)
            else:
                get_comma = full_address.find(',')
                if get_comma != -1:
                    neighbor_text = full_address[:get_comma]
                    neighbor.append(neighbor_text)  # neighborhoods with formatting issues coming from the website itself
                else:
                    get_hif = full_address.find('-')
                    neighbor_text = full_address[:get_hif]
                    neighbor.append(neighbor_text)

            # Collect the link
            full_link = line.find(class_='property-card__main-info').a.get('href')
            link_imovel.append(full_link)

            # Collect the advertiser
            full_anunciante = line.find(class_='property-card__account-link js-property-card-account-link').img.get('alt').title()
            anunciante.append(full_anunciante)

            # Collect the area
            full_area = line.find(class_="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area").text.strip()
            area.append(full_area)

            # Collect the property type
            full_tipo = line.find(class_='property-card__title js-cardLink js-card-title').text.split()[0]
            full_tipo = full_tipo.replace(' ', '')
            full_tipo = full_tipo.replace('\n', '')
            tipo.append(full_tipo)

            # Collect the number of bedrooms
            full_room = line.find(class_="property-card__detail-item property-card__detail-room js-property-detail-rooms").text.strip()
            full_room = full_room.replace(' ', '')
            full_room = full_room.replace('\n', '')
            full_room = full_room.replace('Quartos', '')
            full_room = full_room.replace('Quarto', '')
            room.append(full_room)

            # Collect the number of bathrooms
            full_bath = line.find(class_="property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom").text.strip()
            full_bath = full_bath.replace(' ', '')
            full_bath = full_bath.replace('\n', '')
            full_bath = full_bath.replace('Banheiros', '')
            full_bath = full_bath.replace('Banheiro', '')
            bath.append(full_bath)

            # Collect the number of parking spaces
            full_park = line.find(class_="property-card__detail-item property-card__detail-garage js-property-detail-garages").text.strip()
            full_park = full_park.replace(' ', '')
            full_park = full_park.replace('\n', '')
            full_park = full_park.replace('Vagas', '')
            full_park = full_park.replace('Vaga', '')
            park.append(full_park)

            # Collect the price
            full_price = re.sub('[^0-9]', '', line.find(class_="property-card__price js-property-card-prices js-property-card__price-small").text.strip())
            price.append(full_price)
        except:
            continue

# Close chromedriver
driver.quit()

# Build a pandas dataframe and save it as a CSV file
for i in range(0, len(neighbor)):
    combinacao = [link_imovel[i], address[i], neighbor[i], anunciante[i], area[i], tipo[i], room[i], bath[i], park[i], price[i]]
    df = pd.DataFrame(combinacao)
    with open('VivaRealData.csv', 'a', encoding='utf-16', newline='') as f:
        df.transpose().to_csv(f, encoding='iso-8859-1', header=False)

# Execution time
toc = time.time()
get_time = round(toc-tic, 3)
print('Finished in ' + str(get_time) + ' seconds')
print(str(len(price)) + ' results!')
It seems to me that the "for line in soup.findAll" loop never picks up new cards; I've tried everything, but I always get the data from the first page.

Indeed, the URL returns the same results regardless of the page number requested. It also returns the same information if requests is used, avoiding the huge overhead of Selenium.
A better (and much faster) approach is to access all of the data directly from the site's JSON API.
The following shows a possible starting point. All of the data is inside data; you just need to locate the information you want and access it. I suggest you print(data) and use a tool to format it more readably (a small inspection sketch follows further below, after the code).
import requests, re, time, os, csv

# Ask how many pages should be collected
#pages_number = int(input('How many pages? '))
pages_number = 5

# Start the execution timer
tic = time.time()

sess = requests.Session()
params = {
'addressCity' : 'Poços de Caldas',
'addressLocationId' : 'BR>Minas Gerais>NULL>Pocos de Caldas',
'addressNeighborhood' : '',
'addressState' : 'Minas Gerais',
'addressCountry' : 'Brasil',
'addressStreet' : '',
'addressZone' : '',
'addressPointLat' : '-21.7854',
'addressPointLon' : '-46.561934',
'business' : 'SALE',
'facets' : 'amenities',
'unitTypes' : 'HOME',
'unitSubTypes' : 'UnitSubType_NONE,SINGLE_STOREY_HOUSE,VILLAGE_HOUSE,KITNET',
'unitTypesV3' : 'HOME',
'usageTypes' : 'RESIDENTIAL',
'listingType' : 'USED',
'parentId' : 'null',
'categoryPage' : 'RESULT',
'includeFields' : 'search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount),page,seasonalCampaigns,fullUriFragments,nearby(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),expansion(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones,phones),developments(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),owners(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount))',
'size' : '100',
'from' : '144',
'q' : '',
'developmentsSize' : '5',
'__vt' : '',
'levels' : 'CITY,UNIT_TYPE',
'ref' : '/venda/minas-gerais/pocos-de-caldas/casa_residencial/',
'pointRadius' : '',
'isPOIQuery' : '',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'x-domain': 'www.vivareal.com.br',
}
results = 0

with open('VivaRealData.csv', 'w', newline='', encoding='utf-16') as f_output:
    csv_output = csv.writer(f_output)

    # Loop over the result pages of the site
    for page in range(pages_number+1):
        print(f"Page {page+1}")
        link = 'https://glue-api.vivareal.com/v2/listings'
        params['from'] = f"{page * 100}"
        req = sess.get(link, headers=headers, params=params)
        data = req.json()

        for listing in data['search']['result']['listings']:
            href = listing['link']['href']
            street = listing['listing']['address'].get('street', '').strip()
            bedrooms = listing['listing']['bedrooms'][0]
            bathrooms = listing['listing']['bathrooms'][0]
            price = listing['listing']['pricingInfos'][0]['price']
            row = [href, street, bedrooms, bathrooms, price]
            csv_output.writerow(row)
            results += 1

# Execution time
toc = time.time()
get_time = round(toc-tic, 3)
print(f'Finished in {get_time} seconds')
print(f'{results} results!')
For this example, it is hard coded to 5 pages and returns 593 results in about 6 seconds.
Using Pandas might be a bit overkill here as the data can be written a row at a time directly to your output CSV file.
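To explore which fields are available in the response (the print(data) suggestion above), the standard json module can pretty-print one listing. This is only an inspection aid, and it reuses sess, headers and params from the script above:
# Inspection aid only (not part of the scraping logic): pretty-print the first
# listing of one API response to see which fields are available.
import json

api_url = 'https://glue-api.vivareal.com/v2/listings'
resp = sess.get(api_url, headers=headers, params=params)
data = resp.json()
print(json.dumps(data['search']['result']['listings'][0], indent=2, ensure_ascii=False))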
How was this solved?
Your best friend here is your browser's network dev tools. With these you can watch the requests made to obtain the information. The normal flow is: the initial HTML page is downloaded, its JavaScript runs, and that JavaScript requests more data to fill out the page.
The trick is to first locate where the data you want comes from (it is often returned as JSON), then work out the parameters you need to recreate the request for it.
Approaches using Selenium let the JavaScript run, but most of the time this is not needed, as the JavaScript is just making requests and formatting the data for display.

Related

While loop to get information from HTML code in python

I'm trying to create code that will get the reviewer's name and reviews from Booking.com.
I was able to get all the necessary URLs and isolate the reviewer's name and comments from the HTML code, but I'm struggling to create a while loop to go to the next review.
The while loop should take the reviewer's name, append it to the list, move to the next name, append it, and so forth. I also need to do the same for the comment.
When running the code nothing happens, and I'm not sure where my issue is.
#Loop parameters
##HTMLs
#Booking.com URL
search_url[0] = 'https://www.booking.com/reviews/us/hotel/shore-cliff.es.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGiTAogBAZgBCrgBF8gBDNgBA-gBAYgCAagCA7gC5bPZkQbAAgHSAiQzMTc3NTA4OS00OGRkLTQ5ZjYtYjBhNi1kOWEzYzZhN2QwOWXYAgTgAgE;sid=3e3ae22b47e3df3ac2590eb19d37f888;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=1;r_lang=all;rows=75&'
link = search_urls[0] #Just the first one to try
url = link
html = urllib.request.urlopen(url).read().decode('utf-8') #loading each search page
#Main HTML of first hotel
index=html.find('class="review_list"')
review_list_html = html[index:]
##Lists:
hotels=[]
reviewer_name=[]
review_comment=[]
#Creating counter variable
counter=0
reviewercount =0
#Main HTML of first hotel
index=html.find('class="review_list"')
review_list_html = html[index:]
reviewer_html = review_list_html[review_list_html.find('reviewer_name'):]
review_html = review_list_html[review_list_html.find('class="review_pos ">'):]
#Loop to get reviewer
while review_list_html.find('reviewer_name'):
    #Get reviewer's name
    #Start of reviewers name
    start = reviewer_html.find('<span itemprop="name">')+22 #To ignore <span itemprop="name"> and jump right to the name
    start
    #End of reviewers name
    end = reviewer_html.find('</span>')
    #Isolating reviewers name
    reviewer_html = reviewer_html[start:end]
    #Adding reviewer to list
    reviewer_name.append(reviewer_html)
Your issue is that each subsequent index lookup needs to start from the previous index; otherwise you will create an infinite loop. Generally it's more common to use an HTML parser like Beautiful Soup, but it's absolutely possible to parse this page with the method you're trying to use.
We can use "reviewer_name" as the main index for every review block. Starting from this index we get the indexes of "name" and </span>; the text between those indexes is the reviewer's name. To parse the review body we find all indexes of "reviewBody" that appear before the index of the next review block.
Full code:
from urllib.request import urlopen

link = "https://www.booking.com/reviews/us/hotel/shore-cliff.es.html"

with urlopen(link) as request:
    response = request.read().decode()

reviews = []
name_pos = response.find('"reviewer_name"')  # find first review
while name_pos >= 0:
    name = ""
    review_blocks = []

    start_pos = response.find('"name"', name_pos)
    end_pos = response.find("</span>", start_pos)
    if end_pos > start_pos >= 0:
        name = response[start_pos + 7: end_pos]

    prev_name_pos = name_pos
    name_pos = response.find('"reviewer_name"', name_pos + 1)  # get next review

    start_pos = response.find('"reviewBody"', prev_name_pos, name_pos)
    while start_pos >= 0:
        end_pos = response.find("</span>", start_pos)
        if end_pos > start_pos >= 0:
            review_blocks.append(response[start_pos + 13: end_pos])
        start_pos = response.find('"reviewBody"', start_pos + 1, name_pos)

    reviews.append((name, "\n".join(review_blocks)))
reviews content:
[
('Adriana',
'Nada para criticar.\n'
'Impecable lugar, habitación con vistas hermosas cualquiera sea. Camas '
'confortables, pequeña cocina completa, todo impecable.\n'
'La atención en recepción excelente, no se pierdan las cookies que convidan '
'por la tarde allí. El desayuno variado y con unos tamales exquisitos! Cerca '
'de todo.'),
('Ana', 'Todo excelente'),
('Lara',
'simplemente un poco de ruido en el tercer piso pero solo fue un poco antes '
'de las 10:00pm\n'
'realmente todo estaba excelente, ese gran detalle de el desayuno se les '
'agradece mucho.'),
('Rodrigo',
'Todo me gustó solo lo único que me hubiera gustado que también tuvieran es '
'unas chimeneas.\n'
'El hotel tiene una hermosa vista y se puede caminar y disfrutar por toda la '
'orilla de la playa hasta llegar al muelle y mas lejos si uno quiere.'),
('May', 'Me encanto q estaba abierta la piscina 👍🌊el mar expectacular'),
('Scq', 'Las vistas al Pacífico'),
('Eva', 'Desayuno\nUbicación y limpieza'),
('Marta',
'Muy buena ubicación y vistas al mar. Habitaciones modernas, amplias y con '
'cocina. Buen desayuno y hasta las 10, a diferencia de otros hoteles en los '
'que estuvimos. Personal muy amable. El chek out es a las 12 por lo que te '
'permite disfrutar de las piscina y de las vistas y paseo por la costa.'),
('Filippo',
'Habitación enorme, y muy limpio. \n'
'La habitación con vista al Ocean .... top'),
('Enrique', 'La atención del personal'),
('Lucia',
'El lugar para el desayuno es demasiado pequeño y no hay lugar suficiente '
'para sentarse\n'
'La vista, los jardines y todo el entorno son preciosos. Y es muy '
'confortable!'),
('Pablo', 'El precio.\nLa ubicación y el desayuno'),
('Walter', 'El hotel está bien, la ubicación es buena'),
('Anónimo', 'Muy bueno, el personal muy amable\nExcelente lugar muy cómodo'),
('Gonzalo', ''),
('Maria', ''),
('Rosana', ''),
('Leticia', ''),
('María', ''),
('Samantha', '')
]
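If you later switch to an HTML parser, as the answer suggests, a rough BeautifulSoup sketch of the same extraction might look like this. The class names and itemprop attributes here are assumptions inferred from the string searches above, so verify them against the actual page before relying on them:
# Rough BeautifulSoup sketch of the same idea (selectors are assumptions
# inferred from the string-based code above; check them against the page).
from urllib.request import urlopen
from bs4 import BeautifulSoup

link = "https://www.booking.com/reviews/us/hotel/shore-cliff.es.html"
with urlopen(link) as request:
    soup = BeautifulSoup(request.read().decode(), "html.parser")

reviews = []
for name_tag in soup.find_all(class_="reviewer_name"):
    name = name_tag.get_text(strip=True)
    # Assumption: the surrounding review block is the closest <li> ancestor;
    # adjust if the markup nests differently.
    block = name_tag.find_parent("li") or name_tag.parent
    bodies = [s.get_text(strip=True) for s in block.find_all(attrs={"itemprop": "reviewBody"})]
    reviews.append((name, "\n".join(bodies)))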

How to edit Python code to loop the request to extract information from list

I'm only a couple of weeks into learning Python, and I'm trying to extract specific info from a list (events). I've been able to call the list and extract specific lines (info for a single event), but the objective is to run the program and extract the information from the entire list (info from all of the events).
Among others, my best guesses so far have been along the lines of:
one_a_tag = soup.findAll('a')[22:85]
and
one_a_tag = soup.findAll('a')[22+1]
But I come up with these errors:
TypeError Traceback (most recent call last)
<ipython-input-15-ee19539fbb00> in <module>
11 soup.findAll('a')
12 one_a_tag = soup.findAll('a')[22:85]
---> 13 link = one_a_tag['href']
14 'https://arema.mx' + link
15 eventUrl = ('https://arema.mx' + link)
TypeError: list indices must be integers or slices, not str
And
TypeError Traceback (most recent call last)
<ipython-input-22-81d98bcf8fd8> in <module>
10 soup
11 soup.findAll('a')
---> 12 one_a_tag = soup.findAll('a')[22]+1
13 link = one_a_tag['href']
14 'https://arema.mx' + link
TypeError: unsupported operand type(s) for +: 'Tag' and 'int'
This is the entire code so far:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
url = 'https://arema.mx/'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
soup
soup.findAll('a')
one_a_tag = soup.findAll('a')[22]
link = one_a_tag['href']
'https://arema.mx' + link
eventUrl = ('https://arema.mx' + link)
print(eventUrl)
def getAremaTitulo(eventUrl):
    res = requests.get(eventUrl)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('body > div.body > div.ar.eventname')
    return elems[0].text.strip()

def getAremaInfo(eventUrl):
    res = requests.get(eventUrl)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('body > div.body > div.event-header')
    return elems[0].text.strip()

titulo = getAremaTitulo(eventUrl)
print('Nombre de evento: ' + titulo)
info = getAremaInfo(eventUrl)
print('Info: ' + info)
time.sleep(1)
I'm sure there may be some redundancies in the code, but what I'm most keen on solving is creating a loop to extract the specific info I'm looking for from all of the events. What do I need to add to get there?
Thanks!
To get all information about events, you can use this script:
import requests
from bs4 import BeautifulSoup

url = 'https://arema.mx/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

for event_link in soup.select('#events a.event'):
    u = 'https://arema.mx' + event_link['href']
    s = BeautifulSoup(requests.get(u).content, 'html.parser')

    event_name = s.select_one('.eventname').get_text(strip=True)
    event_info = s.select_one('.event-header').text.strip()

    print(event_name)
    print(event_info)
    print('-' * 80)
Prints:
...
--------------------------------------------------------------------------------
NOCHE BOHEMIA <A PIANO Y GUITARRA>
"Freddy González y Víctor Freez dos amigos que al
paso del tiempo hermanaron sus talentos para crear un concepto musical cálido y
acústico entre cuerdas y teclas haciéndonos vibrar entre una línea de canciones
de ayer y hoy. Rescatando las bohemias que tantos recuerdos y encuentros nos han
generado a lo largo del tiempo.
 Precio: $69*ya incluye cargo de servicio.Fecha: Sábado 15 de agosto 20:00 hrsTransmisión en vivo por Arema LiveComo ingresar a ver la presentación.·         Dale clic en Comprar  y Selecciona tu acceso.·         Elije la forma de pago que más se te facilite y finaliza la compra.·         Te llegara un correo electrónico con la confirmación de compra y un liga exclusiva para ingresar a la transmisión el día seleccionado únicamente.La compra de tu boleto es un apoyo para el artista.Importante:  favor de revisar tu correo en bandeja de entrada, no deseados o spam ya que los correos en ocasiones son enviados a esas carpetas.
--------------------------------------------------------------------------------
...
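If you want to keep the extracted fields rather than only print them, a small variation of the same script (my addition, not part of the original answer) collects them into a list of dicts:
# Small variation (an addition, not part of the original answer):
# collect the extracted fields into a list of dicts for later use.
import requests
from bs4 import BeautifulSoup

url = 'https://arema.mx/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

events = []
for event_link in soup.select('#events a.event'):
    u = 'https://arema.mx' + event_link['href']
    s = BeautifulSoup(requests.get(u).content, 'html.parser')
    events.append({
        'url': u,
        'name': s.select_one('.eventname').get_text(strip=True),
        'info': s.select_one('.event-header').get_text(strip=True),
    })

print(len(events), 'events collected')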

Problem saving web scraping information to a CSV file

I am working on retrieving information from pages through web scraping. My code does not throw errors, but I'm having trouble saving this information in a kind of database. I'm leaving my code here in case anyone can help me; the CSV file gets created, but at the moment it saves absolutely nothing:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.request import urlopen

# SEARCH ENGINE
# Crawl sites through the search bar

class Content:
    # We think of news articles as objects, so we create the Content class to extract the content from the "el ciudadano" site
    def __init__(self, topic, url, title, body, data):
        self.topic = topic  # the search keyword
        self.title = title  # the title
        self.body = body    # the body
        self.url = url      # the URL
        self.data = data    # the publication date

    def print(self):
        print("New article found for topic: {}".format(self.topic))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("URL: {}".format(self.url))
        print("DATA: {}".format(self.data))

class Website:
    """
    Contains information about the structure of the website
    """
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag, dataTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl          # the search URL
        self.resultListing = resultListing  # element that lists all the results found
        self.resultUrl = resultUrl          # tag that holds the link we want to follow
        self.absoluteUrl = absoluteUrl      # boolean flag: absolute URL = True, relative URL = False
        self.titleTag = titleTag            # tag of the article title
        self.bodyTag = bodyTag              # tag of the article body
        self.dataTag = dataTag              # tag of the article publication date

class Crawler:
    # Takes a URL and returns a BeautifulSoup object, handling exceptions and possible errors
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    # Utility function that finds elements inside the BeautifulSoup object
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            # return childObj[0].get_text() would return only the first match
            return '\n'.join(
                [elem.get_text() for elem in childObj])  # return all matches
        return ""  # otherwise return empty

    def search(self, topic, site):  # takes a topic and the site
        """
        Searches a given website for a given topic and records all the pages found
        """
        bs = self.getPage(site.searchUrl + topic)  # request the search URL with the topic appended
        searchResults = bs.select(site.resultListing)  # object holding all the results
        registrocontenido = []
        for result in searchResults:  # there are several results, so go through them one by one
            url = result.select(site.resultUrl)[0].attrs["href"]  # the href attribute holds the link
            # Check whether the URL is relative or absolute
            if(site.absoluteUrl):
                bs = self.getPage(url)             # absolute
            else:
                bs = self.getPage(site.url + url)  # relative
            if bs is None:
                print("Tenemos un problema!!")
                return
            title = self.safeGet(bs, site.titleTag)  # use the safeGet utility function
            body = self.safeGet(bs, site.bodyTag)
            data = self.safeGet(bs, site.dataTag)
            if title != '' and body != '':
                # If title and body are not empty, print them
                content = Content(topic, url, title, body, data)
                content.print()
                registrocontenido.append(content)
        return registrocontenido

def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            csvrow = [article, topic, article.title, article.data, article.body, article.url]
    finally:
        csvFile.close()

crawler = Crawler()

# siteData = [name, main url, search url, result listing tag, result title tag, absolute url, article title tag, body tag, date tag]
siteData = [['El ciudadano', 'https://www.elciudadano.com/', 'https://www.elciudadano.com/?s=', 'div.td_module_16 ', 'h3.entry-title a', True, 'h1.entry-title', 'div.td-post-content p', 'time']]

sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8]))

topics = ['PYTHON']  # topics we want to extract
articles = []
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in sites:  # loop over the sites
        articles.extend(crawler.search(topic, targetSite))
        crawler.search(topic, targetSite)  # call the search function
writeArticles('Articulos.csv', articles)
I'd appreciate any help or suggestions!
In your method that saves the content to CSV, you are not writing to the file; the code just opens and closes the file pointer. Use writer.writerow():
def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            writer.writerow([article, topic, article.title, article.data, article.body, article.url])
    finally:
        csvFile.close()
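As a further small refinement (my addition, not part of the original answer), a with-block with newline='' avoids blank lines on Windows, and taking the topic from the article object removes the reliance on the global topic variable. A minimal sketch:
# Minimal sketch of the same fix with a with-block and a header row
# (an assumption on my part, not the original answer).
import csv

def writeArticles(filename, articles):
    with open(filename, 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(['topic', 'title', 'data', 'body', 'url'])  # header row
        for article in articles:
            writer.writerow([article.topic, article.title, article.data,
                             article.body, article.url])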

Converting .htm to .txt with Python3

Does anyone know if I can optimize the code below, especially the for part?
I have several files in .htm format, and I'm reading each file and generating one large .txt file, but it's taking too long. Is there any way to optimize this code?
Below is the code:
##### Import libraries
from bs4 import BeautifulSoup
import urllib.request
import os

##### Read the files in the folder and store their names in arquivos
os.chdir('C:\\Users\\US365NR\\Desktop\\PROJETO OI\\PEDIDOS_DEBORA\\RAZOES\\PARTE_2')
arquivos = os.listdir()

##### Create a unified txt document and open it
filename = 'UNIFICADO.txt'
file = open(filename, 'w')

##### Iterate over all the files in the folder
for name in arquivos:
    nfLink = 'file:///C:/Users/US365NR/Desktop/PROJETO%20OI/PEDIDOS_DEBORA/RAZOES//PARTE_2//' + name
    print('TRABALHANDO NO ARQUIVO:')
    print(name)

    ##### Read the htm file with BeautifulSoup
    c = urllib.request.urlopen(nfLink)
    soup = c.read()
    soup = BeautifulSoup(soup)
    print('TERMINOU DE LER BEAUTIFUL SOUP')

    ##### Counters to keep track of what is happening
    N_LINHAS = 0
    LINHAS = []
    N_TABLE = 0
    TABELAS = []

    tables = soup.findAll('table')  ##### Find all the tables
    N_TABLE = len(tables)
    for table in tables:  ##### For each table, read its rows
        rows = table.findAll('tr')[1:]
        N_LINHAS += len(rows)
        for tr in rows:  ##### Find the columns
            cols = tr.findAll('td')
            for i in range(0, len(cols)):  ##### Save the information to the txt file
                a = cols[i].text.replace('--*', '').replace('\n', '') + '|'
                file.write(a)
            file.write('\n')  ##### Next row
    LINHAS.append(N_LINHAS)
    TABELAS.append(N_TABLE)

    ##### Progress prints
    print('TOTAL DE LINHAS', LINHAS)
    print('TOTAL DE TABELAS', TABELAS)
    print('FIM DO TRABALHO NO ARQUVO:')
    print(name)
    print('\n')
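No answer is included above, but since the question asks specifically about speeding up the loop, here is a minimal sketch of one common optimization, under two assumptions of mine: the .htm files can be read directly from disk (skipping urllib), and lxml is installed so an explicit, faster parser can be passed to BeautifulSoup. Output rows are buffered per file and written in one go.
# Sketch of one possible optimization (an assumption, not an accepted answer):
# read local .htm files directly, use the lxml parser (pip install lxml),
# and buffer each file's rows before writing them once.
import os
from bs4 import BeautifulSoup

folder = r'C:\Users\US365NR\Desktop\PROJETO OI\PEDIDOS_DEBORA\RAZOES\PARTE_2'

with open(os.path.join(folder, 'UNIFICADO.txt'), 'w', encoding='utf-8') as out:
    for name in os.listdir(folder):
        if not name.lower().endswith('.htm'):
            continue
        # errors='ignore' is a guess; adjust the encoding to match your files
        with open(os.path.join(folder, name), encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'lxml')
        lines = []
        for table in soup.find_all('table'):
            for tr in table.find_all('tr')[1:]:
                cells = [td.get_text().replace('--*', '').replace('\n', '') for td in tr.find_all('td')]
                lines.append('|'.join(cells) + '|')  # same pipe-separated format as the original
        out.write('\n'.join(lines) + '\n')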

Looking for child content with Beautifulsoup

I am trying to scrape a phrase/author from the body of a URL. I can scrape the phrases but I don't know how to find the author and print it together with the phrase. Can you help me?
import urllib.request
from bs4 import BeautifulSoup
page_url = "https://www.pensador.com/frases/"
page = urllib.request.urlopen(page_url)
soup = BeautifulSoup(page, "html.parser")
for frase in soup.find_all("p", attrs={'class': 'frase fr'}):
    print(frase.text + '\n')
    # author = soup.find_all("span", attrs={'class': 'autor'})
    # print(author.text)
    # this is the author that I need, for each phrase the right author
You can get to the parent of the p.frase.fr tag, which is a div, and get the author by selecting span.autor inside that div:
In [1268]: for phrase in soup.select('p.frase.fr'):
      ...:     author = phrase.parent.select_one('span.autor')
      ...:     print(author.text.strip(), ': ', phrase.text.strip())
      ...:
Roberto Shinyashiki : Tudo o que um sonho precisa para ser realizado é alguém que acredite que ele possa ser realizado.
Paulo Coelho : Imagine uma nova história para sua vida e acredite nela.
Carlos Drummond de Andrade : Ser feliz sem motivo é a mais autêntica forma de felicidade.
...
...
Here I'm using a CSS selector via phrase.parent.select_one('span.autor'); you can obviously use find here as well:
phrase.parent.find('span', attrs={'class': 'autor'})
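For completeness, here is the same loop written with find() and collected into a list of (author, phrase) tuples instead of printed; this is a small variation on the answer above, not a different approach:
# Same extraction with find(), collecting results instead of printing them.
pairs = []
for frase in soup.find_all("p", attrs={"class": "frase fr"}):
    autor = frase.parent.find("span", attrs={"class": "autor"})
    pairs.append((autor.get_text(strip=True) if autor else "", frase.get_text(strip=True)))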
