I know this question has been asked before, but I don't think it has been in this specific situation; if it has, feel free to point me to it.
I have an HTML file structured like this (you can view the original here):
<h5 id="foo1">Title 1</h5>
<table class="foo2">
<tbody>
<tr>
<td>
<h3 class="foo3">SomeName1</h3>
<img src="Somesource" alt="SomeName2" title="SomeTitle"><br>
<p class="textcode">
Some precious text here
</p>
</td>
...
</table>
I would like to extract the name, the image and the text contained in the <p> of each table cell, grouped by h5; that is, I would like to save these items in a separate folder named after the h5 they belong to.
I tried this:
# coding: utf-8
import os
import re
from bs4 import BeautifulSoup as bs

os.chdir("WorkingDirectory")
# Open the HTML file and load its contents into the variable of the same name
with open("TheGoodPath.htm","r") as html:
    html = bs(html,'html.parser')

# Select the headers, restrict the results to the first six and create the folders
h5 = html.find_all("h5",limit=6)
for h in h5:
    # Create the files named after the headers
    chemin = u"../Résulat/"
    nom = str(h.contents[0].string)
    os.makedirs(chemin + nom,exist_ok=True)
    # Select the sibling table located right after the header
    table = h.find_next_sibling(name = 'table')
    for t in table:
        # Select the headers containing the document titles
        h3 = t.find_all("h3")
        for k in h3:
            titre = str(k.string)
            # Create the directories named after the figures
            os.makedirs(chemin + nom + titre,exist_ok=True)
            os.fdopen(titre.tex)
            # Retrieve the image located in the sibling tag right after the previous header
            img = k.find_next_sibling("img")
            chimg = img.img['src']
            os.fdopen(img.img['title'])
            # Retrieve the TikZ code located in the sibling tag right after the previous header
            tikz = k.find_next_sibling('p')
            # Extract the TikZ code contained in the tag retrieved above
            code = tikz.get_text()
            # Define then write the preamble and the code needed to produce the image saved above
            preambule = r"%PREAMBULE \n \usepackage{pgfplots} \n \usepackage{tikz} \n \usepackage[european resistor, european voltage, european current]{circuitikz} \n \usetikzlibrary{arrows,shapes,positioning} \n \usetikzlibrary{decorations.markings,decorations.pathmorphing, decorations.pathreplacing} \n \usetikzlibrary{calc,patterns,shapes.geometric} \n %FIN PREAMBULE"
            with open(chemin + nom + titre,'w') as result:
                result.write(preambule + code)
But it prints AttributeError: 'NavigableString' object has no attribute 'find_next_element' for h3 = t.find_all("h3") (line 21).
This seems to be what you want. There only seems to be one table between each h5, so don't iterate over it; just use find_next and use the table returned:
from bs4 import BeautifulSoup
import requests

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)

h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        print(img["src"])
        print(img["title"])
        print(img.find_next("p").text)
        print()
Which gives you output like:
repere-plan.svg
\begin{tikzpicture}[scale=1]
\draw (0,0) --++ (1,1) --++ (3,0) --++ (-1,-1) --++ (-3,0);
\draw [thick] [->] (2,0.5) --++(0,2) node [right] {z};
%thick : gras ; very thick : très gras ; ultra thick : hyper gras
\draw (2,0.5) node [left] {O};
\draw [thick] [->] (2,0.5) --++(-1,-1) node [left] {x};
\draw [thick] [->] (2,0.5) --++(2,0) node [below] {y};
\end{tikzpicture}
Lignes de champ et équipotentielles
images/cours-licence/em3/ligne-champ-equipot.svg
ligne-champ-equipot.svg
\begin{tikzpicture}[scale=0.8]
\draw[->] (-2,0) -- (2,0);
\draw[->] (0,-2) -- (0,2);
\draw node [red] at (-2,1.25) {\scriptsize{Lignes de champ}};
\draw node [blue] at (2,-1.25) {\scriptsize{Equipotentielles}};
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sin(\x r)*3*sin(\x r)*5});
%r = angle en radian
%domain permet de définir le domaine dans lequel la fonction sera tracée
%samples=200 permet d'augmenter le nombre de points pour le tracé
%smooth améliore également la qualité de la trace
\draw[color=red,domain=-3.14:3.14,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sin(\x r)*2*sin(\x r)*5});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={3*sqrt(abs(cos(\x r)))*15});
\draw[color=blue,domain=-pi:pi,samples=200,smooth] plot (canvas polar cs:angle=\x r,radius={2*sqrt(abs(cos(\x r)))*15});
\end{tikzpicture}
Fonction arctangente
images/schemas/math/arctan.svg
arctan.svg
\begin{tikzpicture}[scale=0.8]
\draw[very thin,color=gray] (-pi,pi) grid (-pi,pi);
\draw[->] (-pi,0) -- (pi,0) node[right] {$x$};
\draw[->] (0,-2) -- (0,2);
\draw[color=red,domain=-pi:pi,samples=150] plot ({\x},{rad(atan(\x))} )node[right,red] {$\arctan(x)$};
\draw[color=blue,domain=-pi:pi] plot ({\x},{rad(-atan(\x))} )node[right,blue] {$-\arctan(x)$};
%Le rad() est une autre façon de dire que l'argument est en radian
\end{tikzpicture}
To write all the .svg files to disk:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from os import path

cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont)
base_url = "http://www.physagreg.fr/"

h5s = soup.find_all("h5", limit=6)
for h5 in h5s:
    # find first table after
    table = h5.find_next("table")
    # find all h3 elements in that table
    for h3 in table.select("h3"):
        print(h3.text)
        img = h3.find_next("img")
        src, title = img["src"], img["title"]
        # join base url and image url
        img_url = urljoin(base_url, src)
        # open file using title as file name ("wb": the response content is bytes)
        with open(title, "wb") as f:
            # request the img url and write the content
            f.write(requests.get(img_url).content)
This will give you arctan.svg, courbe-Epeff.svg, and all the rest of the images on the page.
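And if you also want to group the files into folders named after each h5, as the question asks, a sketch along these lines should work (folder and file names are taken straight from the page text, so you may want to sanitize them):
from os import makedirs, path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://www.physagreg.fr/"
cont = requests.get("http://www.physagreg.fr/schemas-figures-physique-svg-tikz.php").text
soup = BeautifulSoup(cont, "html.parser")

for h5 in soup.find_all("h5", limit=6):
    folder = h5.get_text(strip=True)
    makedirs(folder, exist_ok=True)  # one folder per h5 heading
    table = h5.find_next("table")
    for h3 in table.select("h3"):
        img = h3.find_next("img")
        img_url = urljoin(base_url, img["src"])
        # save the svg inside the folder, and the TikZ code next to it
        with open(path.join(folder, img["title"]), "wb") as f:
            f.write(requests.get(img_url).content)
        with open(path.join(folder, img["title"] + ".tex"), "w", encoding="utf-8") as f:
            f.write(img.find_next("p").get_text())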
It looks like (judging by the for t in table loop) you meant to find multiple "table" elements. Use find_next_siblings() instead of find_next_sibling():
table = h.find_next_siblings(name='table')
for t in table:
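For what it's worth, the AttributeError itself comes from that for t in table loop: iterating over a single Tag yields its children, which include NavigableString whitespace nodes, and calling find_all on one of those raises the error. A tiny standalone illustration:
from bs4 import BeautifulSoup

soup = BeautifulSoup("<table>\n<tr><td>x</td></tr>\n</table>", "html.parser")
for child in soup.table:
    # prints NavigableString, Tag, NavigableString
    print(type(child).__name__)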
I am trying to obtain different characters' descriptions and abilities for a dataset.
The problem I've encountered is that there is a span tag within the h2 tag, and in some cases a figure before the p tags. This is the format I'm facing:
<h2><span class="mw-headline" id="Apariencia">Apariencia</span></h2>
<figure class="thumb tleft " style="width: 100px">
<p>...</p>
<p>...</p>
<p>...</p>
<h2><span class="mw-headline" id="Personalidad">Personalidad</span></h2>
<p>...</p>
<p>...</p>
<p>...</p>
I need to obtain the text in those paragraphs.
I have tried something like this but it obviously does not work.
import urllib.request
from bs4 import BeautifulSoup
fp = urllib.request.urlopen("https://jojo.fandom.com/es/wiki/Star_Platinum")
mybytes = fp.read()
html_doc = mybytes.decode("utf8")
fp.close()
soup = BeautifulSoup(html_doc, 'html.parser')
spans = soup.find_all('span', {"class": "mw-headline"})
for s in spans:
    print(s.nextSibling.getText)
You can search backwards for the previous <h2> and store the results in a dictionary:
from bs4 import BeautifulSoup
html_doc = '''\
<h2><span class="mw-headline" id="Apariencia">Apariencia</span></h2>
<figure class="thumb tleft " style="width: 100px">
<p>T1 ...</p>
<p>T2 ...</p>
<p>T3 ...</p>
<h2><span class="mw-headline" id="Personalidad">Personalidad</span></h2>
<p>T4 ...</p>
<p>T5 ...</p>
<p>T6 ...</p>'''
soup = BeautifulSoup(html_doc, 'html.parser')
out = {}
for p in soup.select('p'):
    previous_h2 = p.find_previous('h2')
    out.setdefault(previous_h2.text, []).append(p.text)
print(out)
Prints:
{
'Apariencia': ['T1 ...', 'T2 ...', 'T3 ...'],
'Personalidad': ['T4 ...', 'T5 ...', 'T6 ...']
}
I think in this case, .select with CSS selectors would be very useful.
To get all the section headers, you can use the selector h2:has(span.mw-headline[id]); and to get a specific header by the id attribute value (hId below) of the span nested within, you can use the selector hSel below.
hId = 'Personalidad' # for example
hSel = f'h2:has(span.mw-headline[id="{hId}"])'
Then, to get all the tags after that header, you can use f'{hSel}~*', but you also need to filter out the tags after the next header to only get tags in that section, so the full selector would be
sSel = f'{hSel}~*:not({hSel}~h2~*):not(h2)'
[:not({hSel}~h2~*) filters out the tags after the next header, and :not(h2) filters out the next header tag itself.]
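As a quick illustration of those selectors on the small snippet from the question (this relies on soup.select, i.e. the soupsieve package that ships with bs4):
from bs4 import BeautifulSoup

html_doc = '''\
<h2><span class="mw-headline" id="Apariencia">Apariencia</span></h2>
<p>T1 ...</p>
<p>T2 ...</p>
<h2><span class="mw-headline" id="Personalidad">Personalidad</span></h2>
<p>T3 ...</p>'''

soup = BeautifulSoup(html_doc, 'html.parser')
hId = 'Apariencia'
hSel = f'h2:has(span.mw-headline[id="{hId}"])'
sSel = f'{hSel}~*:not({hSel}~h2~*):not(h2)'
# only the tags between the "Apariencia" header and the next h2
print([p.text for p in soup.select(sSel)])  # ['T1 ...', 'T2 ...']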
Making use of this, the function below returns a dictionary containing section header, text and html.
def get_wikiSection(header, wSoup, sec1Header='? Abstract ?'):
    sSel = 'h2:has(span.mw-headline[id])'
    sSel = f'*:has(~{sSel}):not({sSel}~*)'
    if not header: hId = hSel = None # [first section has no header]
    elif isinstance(header, str):
        hId, hSel = header, f'h2:has(span.mw-headline[id="{header}"])'
        header = wSoup.select_one(hSel)
        if not header: return {'errorMsg': f'Not found: {hSel}'}
    else: hId = header.select_one('span.mw-headline[id]')['id']
    ## header SHOULD BE: None/hId/a tag containing span.mw-headline[id] ##

    if hId:
        hSel = f'h2:has(span.mw-headline[id="{hId}"])'
        sSel = f'{hSel}~*:not({hSel}~h2~*):not(h2)'
        header = header.get_text(' ').strip()
    else: header = sec1Header

    sect = wSoup.select(sSel)
    sText = '\n'.join([s.get_text(' ').strip() for s in sect])
    sHtml = '\n'.join([''.join(s.prettify().splitlines()) for s in sect])
    if not sect: sText = sHtml = None
    return {'headerId': hId, 'sectionHeader': header,
            'sectionText': sText, 'sectionHtml': sHtml}
For example, get_wikiSection('Personalidad', soup) will return
{
'headerId': 'Personalidad',
'sectionHeader': 'Personalidad',
'sectionText': 'Jotaro describió a Star Platinum como un ser muy violento. Suele ser silencioso, excepto cuando lanza golpes, gritando "ORA ORA ORA" en voz alta, bastante rápido y repetidamente. Con una cara relativamente humana ha demostrado tener expresiones tales como fruncir el ceño y sonreír.\nStar Platinum demuestra un tipo de interés en la auto-conservación, como se ve cuando detiene una bala que Jotaro dispara experimentalmente en su propia cabeza, protege a un Jotaro incapacitado de los ataques de DIO durante los efectos de su The World , y lo revive de cerca de la muerte directamente hacia latir su corazón (Sin embargo, considerando el papel pionero de Star Platinum en la serie, esta capacidad puede hablar principalmente de cualidades metafísicas o subconscientes genéricas para los usuarios de Stand).\nEn el manga original, Star Platinum desde el principio se ve con una sonrisa amplia y desconcertante. Más tarde, Star Platinum gana el rostro estoico de Jotaro, cualquier sonrisa futura va a advertir a la persona que se dirige a un gran dolor inminente.\nStar Platinum lleva el nombre de la carta del Tarot La Estrella , que simboliza el optimismo, el discernimiento y la esperanza.',
'sectionHtml': '<p> Jotaro describió a Star Platinum como un ser muy violento. Suele ser silencioso, excepto cuando lanza golpes, gritando "ORA ORA ORA" en voz alta, bastante rápido y repetidamente. Con una cara relativamente humana ha demostrado tener expresiones tales como fruncir el ceño y sonreír.</p>\n<p> Star Platinum demuestra un tipo de interés en la auto-conservación, como se ve cuando detiene una bala que Jotaro dispara experimentalmente en su propia cabeza, protege a un Jotaro incapacitado de los ataques de DIO durante los efectos de su The World , y lo revive de cerca de la muerte directamente hacia latir su corazón (Sin embargo, considerando el papel pionero de Star Platinum en la serie, esta capacidad puede hablar principalmente de cualidades metafísicas o subconscientes genéricas para los usuarios de Stand).</p>\n<p> En el manga original, Star Platinum desde el principio se ve con una sonrisa amplia y desconcertante. Más tarde, Star Platinum gana el rostro estoico de Jotaro, cualquier sonrisa futura va a advertir a la persona que se dirige a un gran dolor inminente.</p>\n<p> Star Platinum lleva el nombre de la carta del Tarot <a class="extiw" href="http://en.wikipedia.org/wiki/es:La_Estrella_(Tarot)" title="wikipedia:es:La Estrella (Tarot)"> La Estrella </a> , que simboliza el optimismo, el discernimiento y la esperanza.</p>'
}
If you want all the sections:
h2Tags = soup.select('h2:has(span.mw-headline[id])')
wikiSections = [get_wikiSection(h, soup) for h in [None]+h2Tags]
The resulting list of dictionaries can also be converted to a pandas DataFrame as simply as pd.DataFrame(wikiSections).
I'm new to webscraping. I have seen a few tutorials on how to scrape websites using beautifulsoup.
As an exercise I would like to extract data from a real estate website.
The specific page I want to scrape is this one: https://www.immoweb.be/fr/recherche/maison-et-appartement/a-vendre?countries=BE&page=1
My goal is to extract a list of all the links to each real estate sale.
Afterwards, I want to loop through that list of links to extract all the data for each sale (price, location, nb bedrooms etc.)
The first issue I'm encountering is that the data scraped using the classic BeautifulSoup code does not match the source code of the webpage.
This is my code:
import requests
from bs4 import BeautifulSoup

URL = "https://www.immoweb.be/fr/recherche/maison-et-appartement/a-vendre?countries=BE&page=1"
page = requests.get(URL)
html = page.content
soup = BeautifulSoup(html, 'html.parser')
print(soup)
Hence, when looking for the links to each real estate sale, which are located under
soup.find_all("a", class_="card__title-link")
it outputs an empty list. Indeed, these tags are not actually present in the HTML fetched by my code above.
Why is that? What should I do to ensure that the extracted html correctly corresponds to what is visible in the source code of the website?
Thank you :-)
The data you see is embedded within the page in JSON format. You can use this example to load it:
import json
import requests
from bs4 import BeautifulSoup

url = "https://www.immoweb.be/fr/recherche/maison-et-appartement/a-vendre?countries=BE&page=1"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(soup.find("iw-search")[":results"])

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# print some data:
for ad in data:
    print(
        "{:<63} {:<8} {}".format(
            ad["property"]["title"],
            ad["transaction"]["sale"]["price"] or "-",
            "https://www.immoweb.be/fr/annonce/{}".format(ad["id"]),
        )
    )
Prints:
Triplex appartement met 3 slaapkamers en garage. 239000 https://www.immoweb.be/fr/annonce/9309298
Appartement 285000 https://www.immoweb.be/fr/annonce/9309895
Heel ruime, moderne, lichtrijke Duplex te koop, bij centrum 269000 https://www.immoweb.be/fr/annonce/9303797
À VENDRE PAR LANDBERGH : appartement de deux chambres à Gand 359000 https://www.immoweb.be/fr/annonce/9310300
Prachtige nieuwbouw appartementen - https://www.immoweb.be/fr/annonce/9309278
Prachtige nieuwbouw appartementen - https://www.immoweb.be/fr/annonce/9309251
Prachtige nieuwbouw appartementen - https://www.immoweb.be/fr/annonce/9309264
Appartement intéressant avec agréable vue panoramique verdoy 219000 https://www.immoweb.be/fr/annonce/9309366
Projet Utopia by Godin - https://www.immoweb.be/fr/annonce/9309458
Appartement 2-ch avec vue unique! 270000 https://www.immoweb.be/fr/annonce/9309183
Residentieel wonen in Hélécine, dichtbij de natuur en de sne - https://www.immoweb.be/fr/annonce/9309241
Appartement 375000 https://www.immoweb.be/fr/annonce/9309187
DUPLEX LUMIEUX ET SPACIEUX 380000 https://www.immoweb.be/fr/annonce/9298271
SINT-PIETERS-LEEUW / Magnifique maison de ±130m² avec jardin 430000 https://www.immoweb.be/fr/annonce/9310259
PARC PARMENTIER // APP MODERNE 3CH 490000 https://www.immoweb.be/fr/annonce/9262193
BOIS DE LA CAMBRE – AV DE FRE – CLINIQUES DE L’EUROPE 575000 https://www.immoweb.be/fr/annonce/9309664
Entre Stockel et le Stade Fallon 675000 https://www.immoweb.be/fr/annonce/9310094
Maisons neuves dans un cadre verdoyant - https://www.immoweb.be/fr/annonce/6792221
Nieuwbouwproject Dockside Gardens - Gent - https://www.immoweb.be/fr/annonce/9008956
Appartement 139000 https://www.immoweb.be/fr/annonce/9187904
A VENDRE CHEZ LANDBERGH: appartements à Merelbeke Flora - https://www.immoweb.be/fr/annonce/9306877
Très beau studio avec une belle vue sur la plage et la mer! 319000 https://www.immoweb.be/fr/annonce/9306787
BEL APPARTEMENT LUMINEUX DIAMANT / PLASKY 320000 https://www.immoweb.be/fr/annonce/9264748
Un projet d'appartements neufs à proximité de Woluwé-St-Lamb - https://www.immoweb.be/fr/annonce/9308037
PLACE JOURDAN - 2 CHAMBRES 345000 https://www.immoweb.be/fr/annonce/9306953
Magnifiek appartement in de Brugse Rand - Assebroek 399000 https://www.immoweb.be/fr/annonce/9306613
Bien d'exception 415000 https://www.immoweb.be/fr/annonce/9308022
Appartement 435000 https://www.immoweb.be/fr/annonce/9307802
Magnifique maison 5CH - 3SDB - bureau - dressing - garage 465000 https://www.immoweb.be/fr/annonce/9307178
Magnifique maison 5CH - 3SDB - bureau - dressing - garage 465000 https://www.immoweb.be/fr/annonce/9307177
EDIT: Added URL column.
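Since the listing URL already takes a page parameter, the remaining result pages can be walked the same way; a small sketch (the page range here is arbitrary):
import json
import requests
from bs4 import BeautifulSoup

for page in range(1, 4):  # adjust the range as needed
    url = f"https://www.immoweb.be/fr/recherche/maison-et-appartement/a-vendre?countries=BE&page={page}"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    data = json.loads(soup.find("iw-search")[":results"])
    print(page, len(data), "ads")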
I'm trying to extract tags from an XML file using re in Python. I need to extract the nodes that start with the tag "<PE" and their corresponding unit IDs, which are the nodes above each "<PE" tag. The file can be seen here
When I use the code below, I don't get the correct "<unit ID" tags, that is, the ones that correspond to each "<PE" tag. For example, in my output, the content extracted from the "<PE" tag paired with "<Unit ID=250" actually belongs to "<Unit ID=149" in the original file. Besides, the code skips some "<Unit ID" tags. Does anyone see where the error is in my code?
import re

t = open('ALICE.per1_replaced.txt', 'r')
t = t.read()
unitid = re.findall('<unit.*?"pe">', t, re.DOTALL)
PE = re.findall("<PE.*?</PE>", t, re.DOTALL)
a = zip(unitid, PE)
tp = tuple(a)
w = open('Tags.txt', 'w')
for x, j in tp:
    a = x + '\n' + j + '\n'
    w.write(a)
w.close()
I've tried this version as well but I had the same problems:
with open('ALICE.per1_replaced.txt', 'r') as t:
    contents = t.read()
unitid = re.findall('<unit.*?"pe">', contents, re.DOTALL)
PE = re.findall('<PE.*?</PE>', contents, re.DOTALL)
with open('PEtagsper1.txt', 'w') as fi:
    for i, p in zip(unitid, PE):
        fi.write("{}\n{}\n".format(i, p))
My desired output is a file with the "<unit id=" tags, each followed by the content of the tag that starts with "<PE" and ends with "</PE>", as below:
<unit id="16" status="FINISHED" type="pe">
<PE producer="A1.ALICE_GG"><html>
<head>
</head>
<body>
Eu vou me atrasar!' (quando ela voltou a pensar sobre isso mais trade,
ocorreu-lhe que deveria ter achado isso curioso, mas na hora tudo pareceu
bastante natural); mas quando o Coelho de fato tirou um relógio do bolso
do colete e olhou-o, e então se apressou, Alice pôs-se de pé, pois lhe
ocorreu que nunca antes vira um coelho com um colete, ou com um relógio de
bolso pra tirar, e queimando de curiosidade, ela atravessou o campo atrás
dele correndo e, felizmente, chegou justo a tempo de vê-lo entrar dentro
de uma grande toca de coelho sob a cerca.
</body>
</html></PE>
You seem to have multiple <PE> tags under each <unit> tag (e.g., for unit 3), thus the zip doesn't pair them correctly. As @Error_2646 noted in the comments, an XML or BeautifulSoup package would work better for this job.
But if for whatever reason you want to stick to regex, you can fix this by running a regex on the list of strings returned by the first regex. Example code that worked on the small part of the input I downloaded:
units = re.findall('<unit.*?</unit>', t, re.DOTALL)
unitList = []
for unit in units:
    # first get your unit regex
    unitid = re.findall('<unit.*?"pe">', unit, re.DOTALL) # same as the one you use
    # there should only be one within each
    assert (len(unitid) == 1)
    # now find all pes for this unit
    PE = re.findall("<PE.*?</PE>", unit, re.DOTALL) # same as the one you use
    # combine results
    output = unitid[0] + "\n"
    for pe in PE:
        output += pe + "\n"
    unitList.append(output)

for x in unitList:
    print(x)
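For completeness, here is a rough sketch of the BeautifulSoup route mentioned above (note that html.parser lowercases tag names, so <PE> is matched as pe; an XML parser could be used instead if the file is well-formed):
from bs4 import BeautifulSoup

with open('ALICE.per1_replaced.txt', 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

with open('Tags.txt', 'w') as w:
    for unit in soup.find_all('unit', attrs={'type': 'pe'}):
        # rebuild the opening <unit ...> line from its attributes
        attrs = ' '.join('{}="{}"'.format(k, v) for k, v in unit.attrs.items())
        for pe in unit.find_all('pe'):
            w.write('<unit {}>\n{}\n'.format(attrs, pe))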
First of all, I'll show the code that I'm having problems with, in order to better explain myself.
<div class="archivos"> ... </div>
<br>
<br>
<br>
<br>
THIS IS THE TEXT THAT I WANT TO CHECK
<div class="archivos"> ... </div>
...
I'm using Selenium in Python.
So, this is a piece of the HTML that I'm working with. My objective is to click a link inside the div with class="archivos", but first I need to analyze the text above it to decide whether I want to click the link or not.
The problem is that the text has no tag around it, and I can't seem to find a way to get hold of it so I can search it for the information I want. The text changes every time, so I need to locate the possible texts preceding every class="archivos" div.
So far I've tried a lot of ways to find it, mainly using XPath, trying to get to the previous element of the div. I haven't come up with anything that works yet, as I'm not very experienced with Selenium and XPath.
I've found this https://chercher.tech/python/relative-xpath-selenium-python, which helped me try some XPaths, and several answers here on SO, but to no avail.
I've read somewhere that I can run JavaScript code from Python using Selenium to get it, but I don't know JavaScript and don't know how to do it. Maybe somebody understands what I'm talking about.
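For reference, that JavaScript route from Selenium could look roughly like the sketch below (the driver setup is illustrative, and the script just walks backwards over the <br> tags to the nearest non-empty text node):
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERLST&DOCS=1-200&BASE=BOLE&SEC=FIRMA&SEPARADOR=&PUBL=20200901")

for div in driver.find_elements(By.CSS_SELECTOR, "div.archivos"):
    text = driver.execute_script("""
        let node = arguments[0].previousSibling;
        while (node && (node.nodeType !== Node.TEXT_NODE || !node.textContent.trim())) {
            node = node.previousSibling;
        }
        return node ? node.textContent.trim() : '';
    """, div)
    print(text)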
This is the webpage if it helps: http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERLST&DOCS=1-200&BASE=BOLE&SEC=FIRMA&SEPARADOR=&PUBL=20200901
Thanks in advance for the help, and I'll provide any further information if it's needed.
Here is an example of how to extract the preceding text with BeautifulSoup. I loaded the page with the requests module, but you can feed BeautifulSoup the HTML source from Selenium:
import requests
from bs4 import BeautifulSoup

url = 'http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERLST&DOCS=1-200&BASE=BOLE&SEC=FIRMA&SEPARADOR=&PUBL=20200901'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

for t in soup.select('.archivos'):
    previous_text = t.find_previous(text=True).strip()
    link = t.a['href']
    print(previous_text)
    print('http://www.boa.aragon.es' + link)
    print('-' * 80)
Prints:
ORDEN HAP/804/2020, de 17 de agosto, por la que se modifica la Relación de Puestos de Trabajo de los Departamentos de Industria, Competitividad y Desarrollo Empresarial y de Economía, Planificación y Empleo.
http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERDOC&BASE=BOLE&PIECE=BOLE&DOCS=1-22&DOCR=1&SEC=FIRMA&RNG=200&SEPARADOR=&&PUBL=20200901
--------------------------------------------------------------------------------
ORDEN HAP/805/2020, de 17 de agosto, por la que se modifica la Relación de Puestos de Trabajo del Departamento de Agricultura, Ganadería y Medio Ambiente.
http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERDOC&BASE=BOLE&PIECE=BOLE&DOCS=1-22&DOCR=2&SEC=FIRMA&RNG=200&SEPARADOR=&&PUBL=20200901
--------------------------------------------------------------------------------
ORDEN HAP/806/2020, de 17 de agosto, por la que se modifica la Relación de Puestos de Trabajo del Organismo Autónomo Instituto Aragonés de Servicios Sociales.
http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERDOC&BASE=BOLE&PIECE=BOLE&DOCS=1-22&DOCR=3&SEC=FIRMA&RNG=200&SEPARADOR=&&PUBL=20200901
--------------------------------------------------------------------------------
ORDEN ECD/807/2020, de 24 de agosto, por la que se aprueba el expediente relativo al procedimiento selectivo de acceso al Cuerpo de Catedráticos de Música y Artes Escénicas.
http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERDOC&BASE=BOLE&PIECE=BOLE&DOCS=1-22&DOCR=4&SEC=FIRMA&RNG=200&SEPARADOR=&&PUBL=20200901
--------------------------------------------------------------------------------
RESOLUCIÓN de 28 de julio de 2020, de la Dirección General de Justicia, por la que se convocan a concurso de traslado plazas vacantes entre funcionarios de los Cuerpos y Escalas de Gestión Procesal y Administrativa, Tramitación Procesal y
Administrativa y Auxilio Judicial de la Administración de Justicia.
http://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI?CMD=VERDOC&BASE=BOLE&PIECE=BOLE&DOCS=1-22&DOCR=5&SEC=FIRMA&RNG=200&SEPARADOR=&&PUBL=20200901
--------------------------------------------------------------------------------
...and so on.
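If you are driving the page with Selenium anyway, the hand-off mentioned above is simply (assuming an already-open driver on the target page):
from bs4 import BeautifulSoup

# `driver` is an already-open Selenium WebDriver
soup = BeautifulSoup(driver.page_source, 'html.parser')
# ...then the same soup.select('.archivos') loop as above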
I'm using python 3.4.
When I use urllib.request.urlretrieve(link, filename="file.html") on a utf-8 file, the resulting file.html is not properly encoded. How do I make sure the file is encoded using utf-8?
How do I apply .decode('utf-8') in this case?
EDIT
This is the original part of page:
« Écoute, mon peuple, je parle ; Moi, Dieu, je suis ton Dieu ! Je ne t'accuse pas pour tes sacrifices ; tes holocaustes sont toujours devant moi. « Je ne prendrai pas un seul taureau de ton domaine, pas un bélier de tes enclos. Tout le gibier des forêts m'appartient et le bétail des hauts pâturages. « Si j'ai faim, irai-je te le dire ? Le monde et sa richesse m'appartiennent. Vais-je manger la chair des taureaux et boire le sang des béliers ? « Qu'as-tu à réciter mes lois, à garder mon alliance à la bouche, toi qui n'aimes pas les reproches et rejettes loin de toi mes paroles ? »
And this is what I get in the saved file:
� �coute, mon peuple, je parle ;�Moi, Dieu, je suis ton Dieu !�Je ne t'accuse pas pour tes sacrifices ; tes holocaustes sont toujours devant moi.�� Je ne prendrai pas un seul taureau de ton domaine, pas un b�lier de tes enclos.�Tout le gibier des for�ts m'appartient et le b�tail des hauts p�turages. � Si j'ai faim, irai-je te le dire ? Le monde et sa richesse m'appartiennent.�Vais-je manger la chair des taureaux et boire le sang des b�liers ?�� Qu'as-tu � r�citer mes lois,�� garder mon alliance � la bouche,�toi qui n'aimes pas les reproches et rejettes loin de toi mes paroles ?��
I noticed that in certain parts of the page the accented characters are not literal UTF-8 characters but HTML escape sequences, yet the browser shows them properly. For example, instead of a literal É there is an escape sequence, and when the file is downloaded this seems to cause problems.
You can unescape the HTML escape sequences in the file line by line using the method shown here.
import html.parser
import urllib.request

h = html.parser.HTMLParser()
with urllib.request.urlopen(link) as fin, open(
        "file.html", 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(h.unescape(line.decode('utf-8')))
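Note that on Python 3.9+ the HTMLParser.unescape() method no longer exists; the module-level html.unescape() function does the same job:
import html
import urllib.request

# `link` as in the question
with urllib.request.urlopen(link) as fin, open(
        "file.html", 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(html.unescape(line.decode('utf-8')))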
I advise using BeautifulSoup to handle this for you: it implicitly converts the loaded document to Unicode (UTF-8).
from bs4 import BeautifulSoup

markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
soup = BeautifulSoup(markup)
soup.h1
# <h1>Sacré bleu!</h1>
soup.h1.string
# u'Sacr\xe9 bleu!'
BeautifulSoup documentation: here
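Applied to the question, a minimal sketch of that approach could be (the from_encoding argument is only needed if the page doesn't declare its charset):
import urllib.request
from bs4 import BeautifulSoup

raw = urllib.request.urlopen(link).read()   # `link` as in the question
soup = BeautifulSoup(raw, 'html.parser', from_encoding='utf-8')
with open('file.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))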