Converting .htm to .txt with Python3

Does anyone know how I can optimize the code below, especially the for loop?
I have several files in .htm format, and my script reads them and writes everything to one large .txt file. But it's taking too long. Is there any way to optimize this code?
Below is the code:
##### Importing libraries
from bs4 import BeautifulSoup
import urllib.request
import os

##### Reading the files in the folder and saving their names in arquivos
os.chdir('C:\\Users\\US365NR\\Desktop\\PROJETO OI\\PEDIDOS_DEBORA\\RAZOES\\PARTE_2')
arquivos = os.listdir()

##### Creating a unified txt document and opening it.
filename = 'UNIFICADO.txt'
file = open(filename, 'w')

##### Creating an iteration to read all the files in the arquivos folder.
for name in arquivos:
    nfLink = 'file:///C:/Users/US365NR/Desktop/PROJETO%20OI/PEDIDOS_DEBORA/RAZOES//PARTE_2//' + name
    print('TRABALHANDO NO ARQUIVO:')
    print(name)

    ##### Reading the htm file with BeautifulSoup
    c = urllib.request.urlopen(nfLink)
    soup = c.read()
    soup = BeautifulSoup(soup)
    print('TERMINOU DE LER BEAUTIFUL SOUP')

    ##### To keep track of what is happening
    N_LINHAS = 0
    LINHAS = []
    N_TABLE = 0
    TABELAS = []

    tables = soup.findAll('table')  ##### Finding all the tables
    N_TABLE = len(tables)
    for table in tables:  ##### For each table, read the rows
        rows = table.findAll('tr')[1:]
        N_LINHAS += len(rows)
        for tr in rows:  ##### Finding the columns
            cols = tr.findAll('td')
            for i in range(0, len(cols)):  ##### Saving the information to the txt file
                a = cols[i].text.replace('--*', '').replace('\n', '') + '|'
                file.write(a)
            file.write('\n')  ##### Next line
    LINHAS.append(N_LINHAS)
    TABELAS.append(N_TABLE)

    ##### Control prints
    print('TOTAL DE LINHAS', LINHAS)
    print('TOTAL DE TABELAS', TABELAS)
    print('FIM DO TRABALHO NO ARQUVO:')
    print(name)
    print('\n')
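A few likely speedups, as a rough sketch rather than a tested rewrite of the code above: open the local .htm files directly instead of routing them through urllib, let BeautifulSoup use the faster lxml parser, and hand it a SoupStrainer so only the table markup is built. The folder path is taken from the question; the latin-1 encoding and the lxml dependency are assumptions.
from bs4 import BeautifulSoup, SoupStrainer
import os

folder = r'C:\Users\US365NR\Desktop\PROJETO OI\PEDIDOS_DEBORA\RAZOES\PARTE_2'
only_tables = SoupStrainer('table')  # build only the <table> parts of each document

with open(os.path.join(folder, 'UNIFICADO.txt'), 'w') as out:
    for name in os.listdir(folder):
        if not name.lower().endswith('.htm'):
            continue
        # read the local file directly instead of going through urllib
        with open(os.path.join(folder, name), encoding='latin-1') as fh:  # encoding assumed
            soup = BeautifulSoup(fh, 'lxml', parse_only=only_tables)
        for table in soup.find_all('table'):
            for tr in table.find_all('tr')[1:]:
                cells = [td.text.replace('--*', '').replace('\n', '') for td in tr.find_all('td')]
                out.write('|'.join(cells) + '|\n')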

Related

WebScraping in Python returning repeated data

I'm using the script below to retrieve property data for a college project. It runs without errors, but the dataframe has repeated values: if I set it to fetch 5 pages, it repeats the same data from page 1 five times. Please help!
import requests, re, time, os, csv
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# Initialize the lists that will store the information
link_imovel = []   # this list will store the url
address = []       # this list will store the address
neighbor = []      # this list will store the neighborhood
anunciante = []    # this list will store the advertiser
area = []          # this list will store the area
tipo = []          # this list will store the property type
room = []          # this list will store the number of bedrooms
bath = []          # this list will store the number of bathrooms
park = []          # this list will store the number of parking spaces
price = []         # this list will store the property price

# Ask how many pages you want to collect
pages_number = int(input('How many pages? '))

# start the execution timer
tic = time.time()

# Configure chromedriver
# to run this, download chromedriver and keep it in the same folder, or change the path
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
time.sleep(15)

# Loop over the site's pages
for page in range(1, pages_number + 1):
    link = 'https://www.vivareal.com.br/venda/minas-gerais/pocos-de-caldas/casa_residencial/?pagina=' + str(page) + ''
    driver.get(link)

    # We define a sleep time so we do not overload the site
    # collect all the information on the page and turn it into a readable format
    data = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup_complete_source = BeautifulSoup(data.encode('utf-8'), "lxml")

    # identify all the property card items
    soup = soup_complete_source.find(class_='results-list js-results-list')

    # Web scraping
    # for each element in the set of cards, collect:
    for line in soup.findAll(class_="js-card-selector"):
        # collect the full address and the neighborhood
        try:
            full_address = line.find(class_="property-card__address").text.strip()
            address.append(full_address.replace('\n', ''))  # Get all address
            if full_address[:3] == 'Rua' or full_address[:7] == 'Avenida' or full_address[:8] == 'Travessa' or full_address[:7] == 'Alameda':
                neighbor_first = full_address.strip().find('-')
                neighbor_second = full_address.strip().find(',', neighbor_first)
                if neighbor_second != -1:
                    neighbor_text = full_address.strip()[neighbor_first + 2:neighbor_second]
                    neighbor.append(neighbor_text)  # Store all the neighborhoods
                else:  # Neighborhood not found
                    neighbor_text = '-'
                    neighbor.append(neighbor_text)  # In case the neighborhood is not found
            else:
                get_comma = full_address.find(',')
                if get_comma != -1:
                    neighbor_text = full_address[:get_comma]
                    neighbor.append(neighbor_text)  # Store the neighborhoods with formatting problems coming from the website itself
                else:
                    get_hif = full_address.find('-')
                    neighbor_text = full_address[:get_hif]
                    neighbor.append(neighbor_text)

            # Collect the link
            full_link = line.find(class_='property-card__main-info').a.get('href')
            link_imovel.append(full_link)

            # Collect the advertiser
            full_anunciante = line.find(class_='property-card__account-link js-property-card-account-link').img.get('alt').title()
            anunciante.append(full_anunciante)

            # Collect the area
            full_area = line.find(class_="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area").text.strip()
            area.append(full_area)

            # Collect the property type
            full_tipo = line.find(class_='property-card__title js-cardLink js-card-title').text.split()[0]
            full_tipo = full_tipo.replace(' ', '')
            full_tipo = full_tipo.replace('\n', '')
            tipo.append(full_tipo)

            # Collect the number of bedrooms
            full_room = line.find(class_="property-card__detail-item property-card__detail-room js-property-detail-rooms").text.strip()
            full_room = full_room.replace(' ', '')
            full_room = full_room.replace('\n', '')
            full_room = full_room.replace('Quartos', '')
            full_room = full_room.replace('Quarto', '')
            room.append(full_room)  # Get apto's rooms

            # Collect the number of bathrooms
            full_bath = line.find(class_="property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom").text.strip()
            full_bath = full_bath.replace(' ', '')
            full_bath = full_bath.replace('\n', '')
            full_bath = full_bath.replace('Banheiros', '')
            full_bath = full_bath.replace('Banheiro', '')
            bath.append(full_bath)  # Get apto's bathrooms

            # Collect the number of parking spaces
            full_park = line.find(class_="property-card__detail-item property-card__detail-garage js-property-detail-garages").text.strip()
            full_park = full_park.replace(' ', '')
            full_park = full_park.replace('\n', '')
            full_park = full_park.replace('Vagas', '')
            full_park = full_park.replace('Vaga', '')
            park.append(full_park)  # Get apto's parking spaces

            # Collect the price
            full_price = re.sub('[^0-9]', '', line.find(class_="property-card__price js-property-card-prices js-property-card__price-small").text.strip())
            price.append(full_price)  # Get apto's price
        except:
            continue

# close chromedriver
driver.quit()

# build a pandas dataframe and save it as a CSV file
for i in range(0, len(neighbor)):
    combinacao = [link_imovel[i], address[i], neighbor[i], anunciante[i], area[i], tipo[i], room[i], bath[i], park[i], price[i]]
    df = pd.DataFrame(combinacao)
    with open('VivaRealData.csv', 'a', encoding='utf-16', newline='') as f:
        df.transpose().to_csv(f, encoding='iso-8859-1', header=False)

# Execution time
toc = time.time()
get_time = round(toc - tic, 3)
print('Finished in ' + str(get_time) + ' seconds')
print(str(len(price)) + ' results!')
It seems to me that the "for line in soup.findAll" loop never picks up new results; I've tried everything but I always get the data from the first page.
Indeed, the URL does return the same results regardless of the page number requested. It also returns the same information if requests is used, avoiding the huge overhead of Selenium.
A better (and much faster) approach is to access all of the data directly from the site's JSON API.
The following shows you a possible starting point. All of the data is inside data; you just need to find the information you want and access it. I suggest you print(data) and use a tool to format it so it is easier to read.
import requests, re, time, os, csv

# Ask how many pages you want to collect
#pages_number = int(input('How many pages? '))
pages_number = 5

# start the execution timer
tic = time.time()

sess = requests.Session()

params = {
    'addressCity' : 'Poços de Caldas',
    'addressLocationId' : 'BR>Minas Gerais>NULL>Pocos de Caldas',
    'addressNeighborhood' : '',
    'addressState' : 'Minas Gerais',
    'addressCountry' : 'Brasil',
    'addressStreet' : '',
    'addressZone' : '',
    'addressPointLat' : '-21.7854',
    'addressPointLon' : '-46.561934',
    'business' : 'SALE',
    'facets' : 'amenities',
    'unitTypes' : 'HOME',
    'unitSubTypes' : 'UnitSubType_NONE,SINGLE_STOREY_HOUSE,VILLAGE_HOUSE,KITNET',
    'unitTypesV3' : 'HOME',
    'usageTypes' : 'RESIDENTIAL',
    'listingType' : 'USED',
    'parentId' : 'null',
    'categoryPage' : 'RESULT',
'includeFields' : 'search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount),page,seasonalCampaigns,fullUriFragments,nearby(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),expansion(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones,phones),developments(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount)),owners(search(result(listings(listing(displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,unitTypes,nonActivationReason,propertyType,unitSubTypes,id,portal,parkingSpaces,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,bedrooms,pricingInfos,showPrice,status,advertiserContact,videoTourLink,whatsappNumber,stamps),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,phones),medias,accountLink,link)),totalCount))',
    'size' : '100',
    'from' : '144',
    'q' : '',
    'developmentsSize' : '5',
    '__vt' : '',
    'levels' : 'CITY,UNIT_TYPE',
    'ref' : '/venda/minas-gerais/pocos-de-caldas/casa_residencial/',
    'pointRadius' : '',
    'isPOIQuery' : '',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'x-domain': 'www.vivareal.com.br',
}

results = 0

with open('VivaRealData.csv', 'w', newline='', encoding='utf-16') as f_output:
    csv_output = csv.writer(f_output)

    # Loop over the site's pages
    for page in range(pages_number + 1):
        print(f"Page {page+1}")
        link = 'https://glue-api.vivareal.com/v2/listings'
        params['from'] = f"{page * 100}"
        req = sess.get(link, headers=headers, params=params)
        data = req.json()

        for listing in data['search']['result']['listings']:
            href = listing['link']['href']
            street = listing['listing']['address'].get('street', '').strip()
            bedrooms = listing['listing']['bedrooms'][0]
            bathrooms = listing['listing']['bathrooms'][0]
            price = listing['listing']['pricingInfos'][0]['price']
            row = [href, street, bedrooms, bathrooms, price]
            csv_output.writerow(row)
            results += 1

# Execution time
toc = time.time()
get_time = round(toc - tic, 3)
print(f'Finished in {get_time} seconds')
print(f'{results} results!')
For this example, it is hard coded to 5 pages and returns 593 results in about 6 seconds.
Using Pandas might be a bit overkill here as the data can be written a row at a time directly to your output CSV file.
How was this solved?
Your best friend here is your browser's network dev tools. With these you can watch the requests made to obtain the information. The normal flow is: the initial HTML page is downloaded, it runs the JavaScript, and that requests more data to further fill the page.
The trick is to first locate where the data you want is (often returned as JSON), then determine what you need to recreate the parameters needed to make the request for it.
Approaches using Selenium allow the javascript to work, but most times this is not needed as it is just making requests and formatting the data for display.
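As a small illustration of the "print(data) and format it" suggestion above, one way to inspect the payload's nesting before picking fields (a sketch; data is the parsed JSON from req.json() in the code above):
import json

# Pretty-print a slice of the payload so the search -> result -> listings
# nesting and the available fields are easy to read.
print(json.dumps(data, indent=2, ensure_ascii=False)[:2000])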

Problem with save in csv information of web scraping

I am retrieving information from pages through web scraping. My code does not throw errors, but I am having trouble saving this information in a kind of database. I am sharing my code in case anyone can help me: I create the CSV file, but at the moment it saves absolutely nothing:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.request import urlopen

# SEARCH ENGINE
# Crawling sites through the search bar
class Content:  # We treat the news articles as objects, so we create the Content class to extract content from the "el ciudadano" site
    def __init__(self, topic, url, title, body, data):  # this is how the class is created; it holds the title, the body, the url and the search topic
        self.topic = topic  # the search keyword
        self.title = title  # the title
        self.body = body    # the body
        self.url = url      # the URL
        self.data = data    # the publication date

    def print(self):
        print("New article found for topic: {}".format(self.topic))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("URL: {}".format(self.url))
        print("DATA: {}".format(self.data))

class Website:  # Class that stores the site's properties
    """
    Holds information about the structure of the website
    """
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag, dataTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl          # holds the search URL
        self.resultListing = resultListing  # element that lists all the results found
        self.resultUrl = resultUrl          # tag that contains the link we want to follow
        self.absoluteUrl = absoluteUrl      # boolean flag that tells whether the url is absolute or relative,
                                            # absolute = True, relative = False
        self.titleTag = titleTag            # tag of the article title
        self.bodyTag = bodyTag              # tag of the article body
        self.dataTag = dataTag              # tag of the article's publication date

class Crawler:  # Class that takes the URL and returns the BeautifulSoup object
    # Exceptions and possible errors are handled here
    def getPage(self, url):  # takes a url and returns a BeautifulSoup object
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    # utility function that finds elements inside the BeautifulSoup object
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            # return childObj[0].get_text() would return only the first match
            return '\n'.join(
                [elem.get_text() for elem in childObj])  # returns all the matches
        return ""  # Otherwise, return empty.

    def search(self, topic, site):  # we pass in a topic and the site
        """
        Searches a given website for a given topic and records all the pages found
        """
        bs = self.getPage(site.searchUrl + topic)  # gets the site URL with the topic
        searchResults = bs.select(site.resultListing)  # the object that contains all the results
        registrocontenido = []
        for result in searchResults:  # since there are several, go through them one by one
            url = result.select(site.resultUrl)[0].attrs["href"]  # the href attribute contains the links
            # Check whether it is a relative or absolute URL.
            if (site.absoluteUrl):
                bs = self.getPage(url)  # if absolute
            else:
                bs = self.getPage(site.url + url)  # if relative
            if bs is None:
                print("Tenemos un problema!!")
                return
            title = self.safeGet(bs, site.titleTag)  # use the safeGet utility function
            body = self.safeGet(bs, site.bodyTag)
            data = self.safeGet(bs, site.dataTag)
            if title != '' and body != '':
                # If title and body are not empty, print.
                content = Content(topic, url, title, body, data)
                content.print()
                registrocontenido.append(content)
        return registrocontenido

def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            csvrow = [article, topic, article.title, article.data, article.body, article.url]
    finally:
        csvFile.close()

crawler = Crawler()
# siteData = [name, main url, search url, result tag, title tag inside the results list, absolute url, article title, body, date]
siteData = [['El ciudadano', 'https://www.elciudadano.com/', 'https://www.elciudadano.com/?s=', 'div.td_module_16 ', 'h3.entry-title a', True, 'h1.entry-title', 'div.td-post-content p', 'time']]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8]))

topics = ['PYTHON']  # topics we want to extract
articles = []
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in sites:  # loop over the sites
        articles.extend(crawler.search(topic, targetSite))
        crawler.search(topic, targetSite)  # call the search function
writeArticles('Articulos.csv', articles)
I appreciate any help or suggestions!
In your method that saves the content to CSV, you are never writing to the file; the code just opens and closes the file pointer. Use writer.writerow():
def writeArticles(filename, articles):
    csvFile = open(filename, 'wt+', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for article in articles:
            writer.writerow([article, topic, article.title, article.data, article.body, article.url])
    finally:
        csvFile.close()
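As a side note, not part of the original answer: the same function can be written with a with block and newline='' so the file is closed automatically and blank rows are avoided on Windows (here using article.topic instead of the global topic variable):
import csv

def writeArticles(filename, articles):
    # newline='' stops the csv module from inserting blank rows on Windows;
    # the with-block closes the file even if a write fails.
    with open(filename, 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        for article in articles:
            writer.writerow([article.topic, article.title, article.data,
                             article.body, article.url])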

Accessing multiple tags inside one tag

I have the following HTML code to scrape:
<ul class="item-features">
    <li>
        <strong>Graphic Type:</strong> Dedicated Card
    </li>
    <li>
        <strong>Resolution:</strong> 3840 x 2160
    </li>
    <li>
        <strong>Weight:</strong> 4.40 lbs.
    </li>
    <li>
        <strong>Color:</strong> Black
    </li>
</ul>
I would like to print every individual item inside the list (Graphic Type, Resolution, Weight, etc.) to a .csv file, each in a different column.
I've tried the following in Python:
import bs4
from urllib.request import urlopen as req
from bs4 import BeautifulSoup as soup

url = 'https://www.newegg.com/Laptops-Notebooks/SubCategory/ID-32?Tid=6740'
Client = req(url)
pagina = Client.read()
Client.close()
pagina_soup = soup(pagina, "html.parser")
productes = pagina_soup.findAll("div", {"class": "item-container"})
producte = productes[0]
features = producte.findAll("ul", {"class": "item-features"})
features[0].text
And it displays all the features but just in one single column of the .csv.
'\nGraphic Type: Dedicated CardResolution: 3840 x 2160Weight: 4.40 lbs.Color: Black\nModel #: AERO 15 OLED SA-7US5020SH\nItem #: N82E16834233268\nReturn Policy: Standard Return Policy\n'
I don't know how to export them one by one. Please see my whole Python code:
import bs4
from urllib.request import urlopen as req
from bs4 import BeautifulSoup as soup

# Link of the page we will scrape
url = 'https://www.newegg.com/Laptops-Notebooks/SubCategory/ID-32?Tid=6740'

# Open a connection with the web page
Client = req(url)
# Offloads the content of the page into a variable
pagina = Client.read()
# Closes the client
Client.close()

# html parser
pagina_soup = soup(pagina, "html.parser")

# grabs each product
productes = pagina_soup.findAll("div", {"class": "item-container"})

# Open a .csv file
filename = "ordinadors.csv"
f = open(filename, "w")

# Headers of my .csv file
headers = "Marca; Producte; PreuActual; PreuAnterior; Rebaixa; CostEnvio\n"

# Write the header
f.write(headers)

# Loop over all the products
for producte in productes:
    # Get the product brand
    marca_productes = producte.findAll("div", {"class": "item-info"})
    marca = marca_productes[0].div.a.img["title"]

    # Get the product name
    name = producte.a.img["title"]

    # Current price
    actual_productes = producte.findAll("li", {"class": "price-current"})
    preuActual = actual_productes[0].strong.text

    # Previous price
    try:
        preuAbans = producte.find("li", class_="price-was").next_element.strip()
    except:
        print("Not found")

    # Get the shipping costs
    costos_productes = producte.findAll("li", {"class": "price-ship"})
    # Since it is a list, take the first element and clean it.
    costos = costos_productes[0].text.strip()

    # Writing the file
    f.write(marca + ";" + name.replace(",", " ") + ";" + preuActual + ";"
            + preuAbans + ";" + costos + "\n")
f.close()
keys = [x.find().text for x in pagina_soup.find_all('li')]
values = [x.find('strong').next_sibling.strip() for x in pagina_soup.find_all('li')]
print(keys)
print(values)
out:
Out[6]: ['Graphic Type:', 'Resolution:', 'Weight:', 'Color:']
Out[7]: ['Dedicated Card', '3840 x 2160', '4.40 lbs.', 'Black']
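Building on those two lists, the pairs can be written as separate CSV columns, which is what the question asks for. A sketch, with features.csv as an example file name:
import csv

# Write the labels as the header row and the values as the data row.
with open('features.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([k.rstrip(':') for k in keys])  # Graphic Type, Resolution, ...
    writer.writerow(values)                         # Dedicated Card, 3840 x 2160, ...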

Parsing webpage that is all text

I'm trying to parse a webpage that is a plain text document. It's wrapped in HTML, so I tried using BeautifulSoup to pull out the text and make a list, but I wasn't able to.
<body>
<pre>
--------------------
BDMEP - INMET
--------------------
Estação : PONTA PORA - MS (OMM: 83702)
Latitude (graus) : -22.55
Longitude (graus) : -55.71
Altitude (metros): 650.00
Estação Operante
Inicio de operação: 24/11/1941
Periodo solicitado dos dados: 01/01/2015 a 17/11/2016
Os dados listados abaixo são os que encontram-se digitados no BDMEP
Hora em UTC
--------------------
Obs.: Os dados aparecem separados por ; (ponto e vírgula) no formato txt.
Para o formato planilha XLS,
siga as instruções
--------------------
Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;
83702;01/01/2015;0000;;;;;;;73.5;3.333333;
83702;06/01/2016;1200;5;;;;;;;;
83702;07/01/2016;0000;;;;;;;76.25;2.40072;
83702;01/02/2016;1200;15.2;;;;;;;;
</pre>
</body>
I'm interested in:
Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;
83702;01/01/2015;0000;;;;;;;73.5;3.333333;
83702;06/01/2016;1200;5;;;;;;;;
83702;07/01/2016;0000;;;;;;;76.25;2.40072;
83702;01/02/2016;1200;15.2;;;;;;;;
Ideally to construct a DataFrame and save as a CSV.
So far I tried stuff like:
soup = BeautifulSoup(a.content, 'html.parser')
soup = soup.find_all('pre')
text = []
for i in soup:
    print(i)
    text.append(i)
But it has not done the trick. It makes it all one entry in the list.
BS is useful for HTML tags, but here you mostly have text, so use string functions like split('\n') and slicing [start_row:end_row].
Your HTML text:
content = '''<body>
<pre>
--------------------
BDMEP - INMET
--------------------
Estação : PONTA PORA - MS (OMM: 83702)
Latitude (graus) : -22.55
Longitude (graus) : -55.71
Altitude (metros): 650.00
Estação Operante
Inicio de operação: 24/11/1941
Periodo solicitado dos dados: 01/01/2015 a 17/11/2016
Os dados listados abaixo são os que encontram-se digitados no BDMEP
Hora em UTC
--------------------
Obs.: Os dados aparecem separados por ; (ponto e vírgula) no formato txt.
Para o formato planilha XLS,
siga as instruções
--------------------
Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;
83702;01/01/2015;0000;;;;;;;73.5;3.333333;
83702;06/01/2016;1200;5;;;;;;;;
83702;07/01/2016;0000;;;;;;;76.25;2.40072;
83702;01/02/2016;1200;15.2;;;;;;;;
</pre>
</body>'''
and
from bs4 import BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')
text = soup.find('pre').text
lines = text.split('\n')
print(lines[-6:-1])
or in one line
print(content.split('\n')[-7:-2])
If the table has more rows, then you can search for the last -------------------- to find the start of the table:
last = content.rfind(' --------------------')
lines = content[last:].split('\n')
print(lines[1:-2])
And now you can split the lines into columns using split(';') to create data for pandas :)
Or use io.StringIO to create a file-like object in memory and use pd.read_csv():
import pandas as pd
import io
last = content.rfind(' --------------------')
lines = content[last:].split('\n')[1:-2]
# create one string with table
text = '\n'.join(lines)
# create file-like object with text
fileobject = io.StringIO(text)
# use file-like object with read_csv()
df = pd.read_csv(fileobject, delimiter=';')
print(df)
or
import pandas as pd
import io
start = content.rfind(' --------------------')
start += len(' --------------------')
end = content.rfind(' </pre>')
text = content[start:end]
fileobject = io.StringIO(text)
df = pd.read_csv(fileobject, delimiter=';')
print(df)
You can use re to do this job:
in:
import re
re.findall(r'\w+;.+\n', string=html)
out:
['Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;\n',
'83702;01/01/2015;0000;;;;;;;73.5;3.333333;\n',
'83702;06/01/2016;1200;5;;;;;;;;\n',
'83702;07/01/2016;0000;;;;;;;76.25;2.40072;\n',
'83702;01/02/2016;1200;15.2;;;;;;;;\n']
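If you take the re route, the matched lines can still become the DataFrame/CSV the question asks for. A sketch, assuming html holds the page source as in the snippet above and inmet.csv is just an example output name:
import io
import re
import pandas as pd

rows = re.findall(r'\w+;.+\n', string=html)   # header line plus the data lines
df = pd.read_csv(io.StringIO(''.join(rows)), delimiter=';')
df.to_csv('inmet.csv', index=False)
print(df)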

Parsing a tag in HTML

I know this question has been asked before, but I think not in this specific situation; if it has been, feel free to point me to it.
I have an HTML file organized this way (you can view the original here):
<h5 id="foo1">Title 1</h5>
<table class="foo2">
    <tbody>
        <tr>
            <td>
                <h3 class="foo3">SomeName1</h3>
                <img src="Somesource" alt="SomeName2" title="SomeTitle"><br>
                <p class="textcode">
                    Some precious text here
                </p>
            </td>
            ...
</table>
I would like to extract the name, the image and the text contained in the <p> of each table cell, for each h5 separately, meaning I would like to save each of these items in a separate folder named after the corresponding h5.
I tried this:
# coding: utf-8
import os
import re
from bs4 import BeautifulSoup as bs

os.chdir("WorkingDirectory")

# Select the HTML file and load its content into the variable of the same name
with open("TheGoodPath.htm", "r") as html:
    html = bs(html, 'html.parser')

# Select the headers, restrict the results to the first six and create the folders
h5 = html.find_all("h5", limit=6)
for h in h5:
    # Create the folders named after the headers
    chemin = u"../Résulat/"
    nom = str(h.contents[0].string)
    os.makedirs(chemin + nom, exist_ok=True)

    # Select the sibling table located right after the header
    table = h.find_next_sibling(name='table')
    for t in table:
        # Select the headers containing the document titles
        h3 = t.find_all("h3")
        for k in h3:
            titre = str(k.string)
            # Create the directories named after the figures
            os.makedirs(chemin + nom + titre, exist_ok=True)
            os.fdopen(titre.tex)

            # Get the image located in the sibling tag right after the previous header
            img = k.find_next_sibling("img")
            chimg = img.img['src']
            os.fdopen(img.img['title'])

            # Get the TikZ code located in the sibling tag right after the previous header
            tikz = k.find_next_sibling('p')
            # Extract the TikZ code contained in the tag retrieved above
            code = tikz.get_text()

            # Define, then write, the preamble and the code needed to produce the previously saved image
            preambule = r"%PREAMBULE \n \usepackage{pgfplots} \n \usepackage{tikz} \n \usepackage[european resistor, european voltage, european current]{circuitikz} \n \usetikzlibrary{arrows,shapes,positioning} \n \usetikzlibrary{decorations.markings,decorations.pathmorphing, decorations.pathreplacing} \n \usetikzlibrary{calc,patterns,shapes.geometric} \n %FIN PREAMBULE"
            with open(chemin + nom + titre, 'w') as result:
                result.write(preambule + code)
But it prints AttributeError: 'NavigableString' object has no attribute 'find_next_element' for h3 = t.find_all("h3"), line 21
This seems to be what you want. There only seems to be one table after each h5, so don't iterate over it; just use find_next and use the table it returns:
from bs4 import BeautifulSoup
import requests

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)
h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        print(img["src"])
        print(img["title"])
        print(img.find_next("p").text)
        print()
Which gives you output like:
repere-plan.svg
\begin{tikzpicture}[scale=1]
\draw (0,0) --++ (1,1) --++ (3,0) --++ (-1,-1) --++ (-3,0);
\draw [thick] [->] (2,0.5) --++(0,2) node [right] {z};
%thick : gras ; very thick : très gras ; ultra thick : hyper gras
\draw (2,0.5) node [left] {O};
\draw [thick] [->] (2,0.5) --++(-1,-1) node [left] {x};
\draw [thick] [->] (2,0.5) --++(2,0) node [below] {y};
\end{tikzpicture}
Lignes de champ et équipotentielles
images/cours-licence/em3/ligne-champ-equipot.svg
ligne-champ-equipot.svg
\begin{tikzpicture}[scale=0.8]
\draw[->] (-2,0) -- (2,0);
\draw[->] (0,-2) -- (0,2);
\draw node [red] at (-2,1.25) {\scriptsize{Lignes de champ}};
\draw node [blue] at (2,-1.25) {\scriptsize{Equipotentielles}};
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sin(\x r)*3*sin(\x r)*5});
%r = angle en radian
%domain permet de définir le domaine dans lequel la fonction sera tracée
%samples=200 permet d'augmenter le nombre de points pour le tracé
%smooth améliore également la qualité de la trace
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sin(\x r)*2*sin(\x r)*5});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sqrt(abs(cos(\x r)))*15});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sqrt(abs(cos(\x r)))*15});
\end{tikzpicture}
Fonction arctangente
images/schemas/math/arctan.svg
arctan.svg
\begin{tikzpicture}[scale=0.8]
\draw[very thin,color=gray] (-pi,pi) grid (-pi,pi);
\draw[->] (-pi,0) -- (pi,0) node[right] {$x$};
\draw[->] (0,-2) -- (0,2);
\draw[color=red,domain=-pi:pi,samples=150] plot ({\x},{rad(atan(\x))} )node[right,red] {$\arctan(x)$};
\draw[color=blue,domain=-pi:pi] plot ({\x},{rad(-atan(\x))} )node[right,blue] {$-\arctan(x)$};
%Le rad() est une autre façon de dire que l'argument est en radian
\end{tikzpicture}
To write all the .svg's to disk:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
from os import path

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)
base_url = "http://www.physagreg.fr/"
h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        src, title = img["src"], img["title"]
        # join base url and image url
        img_url = urljoin(base_url, src)
        # open file using title as file name
        with open(title, "w") as f:
            # request the img url and write the content
            f.write(requests.get(img_url).content)
Which will give you arctan.svg, courbe-Epeff.svg and all the rest on the page, etc.
It looks like (judging by the for t in table loop) you meant to find multiple "table" elements. Use find_next_siblings() instead of find_next_sibling():
table = h.find_next_siblings(name='table')
for t in table:
