I'm trying to scrape this URL: https://www.ventanillaunicaenfermeria.es/BuscarColegiados.php.
I need to gather the values of the "Nº cole." column and the "Nombre Colegiado" column.
I'm using BeautifulSoup, but I only get the values of the "Nº cole." column. How can I fix that?
Thanks!
This is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

page = requests.get('https://www.ventanillaunicaenfermeria.es/BuscarColegiados.php')
soup = BeautifulSoup(page.text, 'html.parser')

data = soup.find_all("span", {'class': 'colColegiado'})

numero_col = []
for i in data:
    data_num = i.text.strip()
    numero_col.append(data_num)
numero_col
['Nº cole.',
'6478',
'13107',
'7341',
'12110',
'5625',
'4877',
'4700',
'9126',
'8444',
'13120',
'5023',
'12235',
'7747',
'17701',
'17391',
'17944',
'17772',
'7230',
'11729',
'17275']
You're currently fetching the values from the wrong html elements - it should be from all <p>s with the resalto class.
import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import numpy as np

page = requests.get('https://www.ventanillaunicaenfermeria.es/BuscarColegiados.php')
soup = BeautifulSoup(page.text, 'html.parser')

data = soup.find_all("p", {'class': 'resalto'})

schools = []
for result in data:
    data_num = result.contents[0].text.strip()   # the "Nº cole." number inside the span
    data_name = str(result.contents[1])          # the name text that follows the span
    schools.append((data_num, data_name))

print(schools)
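If you then want the two columns in a DataFrame, here is a minimal sketch building on the schools list from the loop above:

import pandas as pd

# schools is the list of (number, name) tuples built above
df = pd.DataFrame(schools, columns=["Nº cole.", "Nombre colegiado"])
print(df.head())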
Instead of selecting all <p> elements at once, you can loop through only the paragraphs inside the table. The following code takes a page number and saves the table to a CSV file.

import requests
from bs4 import BeautifulSoup
import pandas as pd

pageno = 1
res = requests.get(f'https://www.ventanillaunicaenfermeria.es/BuscarColegiados.php?nombre=&ap=&colegio=&col=&nif=&pagina={pageno}')
soup = BeautifulSoup(res.text, "html.parser")

header = soup.find("div", {"id": "contactaForm"}).find("h4")
cols = [header.find("span").get_text(), header.get_text().replace(header.find("span").get_text(), "")]

data = []
for p in soup.find("div", {"id": "contactaForm"}).find_all("p"):
    if len(p['class']) == 0 or p['class'][0] == "resalto":
        child = list(p.children)
        data.append([child[0].get_text(strip=True), child[1]])

df = pd.DataFrame(data, columns=cols)
df.to_csv("data.csv", index=False)
print(df)
Output:
Nº cole. Nombre colegiado
0 6478 GUADALUPE LAZARO LAZARO
1 13107 JOSE MARIA PIÑA MANZANO
2 7341 HEIKE ELFRIEDE BIRKHOLZ
3 12110 ESTHER TIZON ROLDAN
4 5625 MARIA DOLORES TOMAS GARCIA-VAQUERO
5 4877 MARIA CARMEN CASADO LLAVONA
6 4700 MANUEL GUILABERT ORTEGA-VILLAIZAN
7 9126 MARIA ESPERANZA ASENSIO ALMAZAN
8 8444 CONCEPCION VIALARD RODRIGUEZ
9 13120 NURIA VILLAESCUSA SANCHEZ
10 5023 ARTURO BONET BLANCO
11 12235 ALFONSO JIMENEZ LOPEZ
12 7747 JACOBUS PETRUS SINNIGE
13 17701 ANIA BRAVO FIGUEREDO
14 17391 LUSINE DAMIRCHYAN
15 17944 ISALKOU DJIL MERHBA
16 17772 CARLA DENISSE FIGUEROA PIEDRA
17 7230 MARIA ISABEL VISO CABAÑERO
18 11729 PILAR GARCIA SALAZAR
19 17275 MARIA LOURDES MALLEN LLUIS
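To collect more than one page, you could wrap the same parsing in a small function and loop over page numbers. This is a sketch, assuming the pagina query parameter behaves the same for higher page numbers:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_page(pageno):
    # same parsing as the answer above: number + name from each result paragraph
    url = f'https://www.ventanillaunicaenfermeria.es/BuscarColegiados.php?nombre=&ap=&colegio=&col=&nif=&pagina={pageno}'
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    rows = []
    for p in soup.find("div", {"id": "contactaForm"}).find_all("p"):
        cls = p.get("class") or []
        if not cls or cls[0] == "resalto":
            child = list(p.children)
            rows.append([child[0].get_text(strip=True), child[1]])
    return rows

# assumption: only the first three result pages are fetched; widen the range as needed
data = [row for pageno in range(1, 4) for row in scrape_page(pageno)]
pd.DataFrame(data, columns=["Nº cole.", "Nombre colegiado"]).to_csv("data.csv", index=False)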
I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I'm new to web scraping. I hope someone can help me find the problem.
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
#print(soup)

film_in = soup.find('tbody').findAll('tr')
#print(film_in)
film = film_in[0]
#print(film)

titre = film.find("a", {'title': 'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
print(titre.text)
rang = film.find("td", {'class': 'ratingColumn imdbRating'}).find('strong').text
#print(rang)

def remove_parentheses(string):
    return string.replace("(", "").replace(")", "")

année = film.find("span", {'class': 'secondaryInfo'}).text
#print(année)

imdb = []
for films in film_in:
    titre = film.find("a", {'title': 'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
    rang = film.find("td", {'class': 'ratingColumn imdbRating'}).find('strong').text
    année = remove_parentheses(film.find("span", {'class': 'secondaryInfo'}).text)
    dictionnaire = {'film': film,
                    'rang': rang,
                    'année': année
                    }
    imdb.append(dictionnaire)

df_imdb = pd.DataFrame(imdb)
print(df_imdb)
I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I need to solve it using urllib; is there a way? Thanks in advance.
I'm new to web scraping.
You can try the following example:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd

url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
#soup = BeautifulSoup(requests.get(url).text, 'html.parser')  # requests works just as well here
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')

imdb = []
film_in = soup.select('table[class="chart full-width"] tr')
for film in film_in[1:]:
    titre = film.select_one('.titleColumn a').get_text(strip=True)
    rang = film.select_one('[class="ratingColumn imdbRating"] > strong').text
    année = film.find("span", {'class': 'secondaryInfo'}).get_text(strip=True)
    dictionnaire = {'titre': titre,
                    'rang': rang,
                    'année': année
                    }
    imdb.append(dictionnaire)

df_imdb = pd.DataFrame(imdb)
print(df_imdb)
Output:
titre rang année
0 The Shawshank Redemption 9.2 (1994)
1 The Godfather 9.2 (1972)
2 The Dark Knight 9.0 (2008)
3 The Godfather Part II 9.0 (1974)
4 12 Angry Men 9.0 (1957)
.. ... ... ...
245 Dersu Uzala 8.0 (1975)
246 Aladdin 8.0 (1992)
247 The Help 8.0 (2011)
248 The Iron Giant 8.0 (1999)
249 Gandhi 8.0 (1982)
[250 rows x 3 columns]
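If you also want the year without the parentheses, as the remove_parentheses helper in the question intended, a small self-contained sketch:

def remove_parentheses(string):
    # "(1994)" -> "1994"
    return string.replace("(", "").replace(")", "")

print(remove_parentheses("(1994)"))  # 1994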
I am writing a small program to fetch stock exchange data using Python. The sample code below makes a request to a URL and it should return the appropriate data. Here is the resource that I am using:
https://python.plainenglish.io/4-python-libraries-to-help-you-make-money-from-webscraping-57ba6d8ce56d
from xml.dom.minidom import Element
from selenium import webdriver
from bs4 import BeautifulSoup
import logging
from selenium.webdriver.common.by import By
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
url = "http://eoddata.com/stocklist/NASDAQ/A.htm"
driver = webdriver.Chrome(executable_path="C:\Program Files\Chrome\chromedriver")
page = driver.get(url)
# TODO: find element by CSS selector
stock_symbol = driver.find_elements(by=By.CSS_SELECTOR, value='#ctl00_cph1_divSymbols')
soup = BeautifulSoup(driver.page_source, features="html.parser")
elements = []
table = soup.find('div', {'id','ct100_cph1_divSymbols'})
logging.info(f"{table}")
I've added a todo for getting the element that I am trying to retrieve from the program.
Expected:
The proper data should be returned.
Actual:
Nothing is returned.
It is common practice to scrape tables with pandas.read_html() to get their text, so I would recommend that as well.
But to answer your question and follow your approach, select the <div> and the <table> more specifically:
soup.select('#ctl00_cph1_divSymbols table')
To get and store the data you could iterate over the rows and append the results to a list:
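Here is a minimal read_html sketch, assuming the page serves the table to a plain GET request and that lxml or html5lib is installed; the position of the symbols table in the returned list may differ:

import pandas as pd
import requests
from io import StringIO

url = "https://eoddata.com/stocklist/NASDAQ/A.htm"
html = requests.get(url).text

# read_html parses every <table> on the page into a DataFrame
tables = pd.read_html(StringIO(html))
symbols = tables[-1]          # assumption: the symbol listing is the last table
print(symbols.head())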
data = []
for row in soup.select('#ctl00_cph1_divSymbols table tr:has(td)'):
    d = dict(zip(soup.select_one('#ctl00_cph1_divSymbols table tr:has(th)').stripped_strings, row.stripped_strings))
    d.update({'url': 'https://eoddata.com' + row.a.get('href')})
    data.append(d)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://eoddata.com/stocklist/NASDAQ/A.htm"
res = requests.get(url)
soup = BeautifulSoup(res.text)

data = []
for row in soup.select('#ctl00_cph1_divSymbols table tr:has(td)'):
    d = dict(zip(soup.select_one('#ctl00_cph1_divSymbols table tr:has(th)').stripped_strings, row.stripped_strings))
    d.update({'url': 'https://eoddata.com' + row.a.get('href')})
    data.append(d)

pd.DataFrame(data)
Output
   Code   Name                             High    Low     Close   Volume      Change   url
0  AACG   Ata Creativity Global ADR        1.390   1.360   1.380   8,900       0        https://eoddata.com/stockquote/NASDAQ/AACG.htm
1  AACI   Armada Acquisition Corp I        9.895   9.880   9.880   5,400       -0.001   https://eoddata.com/stockquote/NASDAQ/AACI.htm
2  AACIU  Armada Acquisition Corp I        9.960   9.960   9.960   300         -0.01    https://eoddata.com/stockquote/NASDAQ/AACIU.htm
3  AACIW  Armada Acquisition Corp I WT     0.1900  0.1699  0.1700  36,400      -0.0193  https://eoddata.com/stockquote/NASDAQ/AACIW.htm
4  AADI   Aadi Biosciences Inc             13.40   12.66   12.90   98,500      -0.05    https://eoddata.com/stockquote/NASDAQ/AADI.htm
5  AADR   Advisorshares Dorsey Wright ETF  47.49   46.82   47.49   1,100       0.3      https://eoddata.com/stockquote/NASDAQ/AADR.htm
6  AAL    American Airlines Gp             14.44   13.70   14.31   45,193,100  -0.46    https://eoddata.com/stockquote/NASDAQ/AAL.htm
...
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

#put all items in this array
response = requests.get('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table', class_='expo-table general-color')
for row in table:
    for up in row.find_all('td'):
        text_list = [text for text in up.stripped_strings]
        print(text_list)
This code works and gets me the correct data, but it doesn't print the output in the format shown below. I want the output in this format; can you help me?
Indirizzo Bliedinghauserstrasse 27
Città Remscheid
Nazionalità Germania
Sito web www.amannesmann.de
Stand Pad. 3 E14 F11
Telefono +492191989-0
Fax +492191989-201
E-mail sales#mannesmann.de
Membro di Cecimo
Social
pandas has a built-in HTML table scraper, so you can run:
df = pd.read_html('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
This returns a list of all tables on the page as DataFrames; you can access your data with df[0]:
             0                         1
0    Indirizzo  Bliedinghauserstrasse 27
1        Città                 Remscheid
2  Nazionalità                  Germania
3     Sito web        www.amannesmann.de
4        Stand            Pad. 3 E14 F11
5     Telefono              +492191989-0
6          Fax            +492191989-201
7       E-mail       sales#mannesmann.de
8    Membro di                       nan
9       Social                       nan
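If you then want the label/value pairs as a dictionary, here is a small sketch, assuming the exhibitor details are the first table returned:

import pandas as pd

url = 'http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh'
tables = pd.read_html(url)
# assumption: the exhibitor details are the first table on the page
info = dict(zip(tables[0][0], tables[0][1]))
print(info.get('Città'))   # e.g. Remscheid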
You can use the .get_text() method to extract the text, with strip=True to avoid whitespace and separator=" " to put a space between the cells:
# find() returns a single table tag, so .find_all("tr") works on it
table = soup.find('table', class_='expo-table general-color')
data = table.find_all("tr")
for i in data:
    print(i.get_text(strip=True, separator=" "))
Output:
Indirizzo Bliedinghauserstrasse 27
Città Remscheid
...
Instead of selecting <td>, select <tr> and use .stripped_strings on it to get the row-wise data, then append it to the DataFrame.
Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

#put all items in this array
temp = []

response = requests.get('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table', class_='expo-table general-color')
for row in table:
    for up in row.find_all('tr'):
        temp.append([text for text in up.stripped_strings])

df = pd.DataFrame(temp)
print(df)
0 1
0 Indirizzo Bliedinghauserstrasse 27
1 Città Remscheid
2 Nazionalità Germania
3 Sito web www.amannesmann.de
4 Stand Pad. 3 E14 F11
5 Telefono +492191989-0
6 Fax +492191989-201
7 E-mail sales#mannesmann.de
8 Membro di None
9 Social None
I am trying to scrape the proxy list of this site. However, I can't find the value inside the textarea tag.
Here is my code:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://openproxy.space/list/azneonYD26")
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find('section', class_='data')
rows = results.find('textarea')
print(rows)
Actually, you can scrape that <script> tag and extract all the proxy data (country, count, all the IPs) with a bit of regex magic and some chained replace() calls.
Here's how:
import json
import re

import requests
from bs4 import BeautifulSoup

page = requests.get("https://openproxy.space/list/azneonYD26").text
scripts = BeautifulSoup(page, "html.parser").find_all("script")

proxy_script = re.search(r"LIST\",data:(.*),code", scripts[2].string).group(1)
proxy_data = json.loads(
    re.sub(r":([a-z])", r':"\1"', proxy_script)
    .replace("code", '"code"')
    .replace("count", '"count"')
    .replace("items", '"items"')
    .replace("active", '"active"')
)

for proxy in proxy_data:
    print(proxy["code"], proxy["count"], proxy["items"][0])
Output:
CN 122 222.129.37.240:57114
US 82 98.188.47.132:4145
DE 51 78.46.218.20:12855
IN 15 43.224.10.37:6667
FR 9 51.195.91.196:9095
AR 8 186.126.181.223:1080
RU 7 217.28.221.10:30005
GB g 46.101.24.42:1080
SG g 8.210.163.246:50001
NL f 188.166.34.137:9000
BD 3 103.85.232.20:1080
NO d 146.59.156.73:9095
CA d 204.101.61.82:4145
BR d 179.189.226.186:8080
HK b 119.28.128.211:1080
AU b 139.99.237.180:9095
VN b 123.16.56.161:1080
KR b 125.135.221.94:54398
TH b 101.108.25.227:9999
BG b 46.10.218.194:1080
AT b 195.144.21.185:1080
VE b 200.35.79.77:1080
IE b 52.214.159.193:9080
ES b 185.66.58.142:42647
JP b 139.162.78.109:1080
UA b 46.151.197.254:8080
PL b 147.135.208.13:9095
If you want to view everything just print out the proxy_data variable.
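For example, to flatten every IP:port from all countries into a single list, a small sketch that reuses the proxy_data structure parsed above:

# proxy_data is the list of {"code", "count", "items", ...} dicts built above
all_proxies = [proxy for country in proxy_data for proxy in country["items"]]
print(len(all_proxies), all_proxies[:3])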
I am having some difficulty saving the results that I am scraping.
Please refer to this code (it was slightly changed for my specific case):
import bs4, requests
import pandas as pd
import re
import time

headline = []
corpus = []
dates = []
tag = []

start = 1
url = "https://www.imolaoggi.it/category/cron/"

while True:
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'html')

    headlines = soup.find_all('h3')
    corpora = soup.find_all('p')
    dates = soup.find_all('time', attrs={'class': 'entry-date published updated'})
    tags = soup.find_all('span', attrs={'class': 'cat-links'})

    for t in headlines:
        headline.append(t.text)
    for s in corpora:
        corpus.append(s.text)
    for d in date:
        dates.append(d.text)
    for c in tags:
        tag.append(c.text)

    if soup.find_all('a', attrs={'class': 'page-numbers'}):
        url = f"https://www.imolaoggi.it/category/cron/page/{page}"
        page += 1
    else:
        break
Create dataframe
df = pd.DataFrame(list(zip(date, headline, tag, corpus)),
                  columns=['Date', 'Headlines', 'Tags', 'Corpus'])
I would like to save all the pages from this link. The code works, but it seems that for every page it writes the same two sentences for the corpus.
I think this is happening because of the tag I chose:
corpora=soup.find_all('p')
This causes a misalignment of rows in my dataframe: the data are saved in lists, and the corpus only starts being scraped correctly later compared to the other fields.
I hope you can help me understand how to fix it.
You were close, but your selectors were off and you mis-named some of your variables.
I would use CSS selectors like this:
headline = []
corpus = []
date_list = []
tag_list = []

headlines = soup.select('h3.entry-title')
corpora = soup.select('div.entry-meta + p')
dates = soup.select('div.entry-meta span.posted-on')
tags = soup.select('span.cat-links')

for t in headlines:
    headline.append(t.text)
for s in corpora:
    corpus.append(s.text.strip())
for d in dates:
    date_list.append(d.text)
for c in tags:
    tag_list.append(c.text)

df = pd.DataFrame(list(zip(date_list, headline, tag_list, corpus)),
                  columns=['Date', 'Headlines', 'Tags', 'Corpus'])
df
Output:
Date Headlines Tags Corpus
0 30 Ottobre 2020 Roma: con spranga di ferro danneggia 50 auto i... CRONACA, NEWS Notte di vandalismi a Colli Albani dove un uom...
1 30 Ottobre 2020\n30 Ottobre 2020 Aggressione con machete: grave un 28enne, arre... CRONACA, NEWS Roma - Ha impugnato il suo machete e lo ha agi...
2 30 Ottobre 2020\n30 Ottobre 2020 Deep State e globalismo, Mons. Viganò scrive a... CRONACA, NEWS LETTERA APERTA\r\nAL PRESIDENTE DEGLI STATI UN...
3 30 Ottobre 2020 Meluzzi e Scandurra: “Sacrificare libertà per ... CRONACA, NEWS "Sacrificare la libertà per la sicurezza è un ...
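Since the question asks to save every page and the selectors above work one page at a time, you could run them inside a page loop. This is a sketch, assuming the /page/{n}/ URL pattern from the question keeps working:

import bs4
import requests
import pandas as pd

rows = []
# assumption: only the first few pages are fetched here; widen the range as needed
for page in range(1, 4):
    r = requests.get(f"https://www.imolaoggi.it/category/cron/page/{page}/")
    soup = bs4.BeautifulSoup(r.text, "html.parser")
    headlines = soup.select("h3.entry-title")
    corpora = soup.select("div.entry-meta + p")
    dates = soup.select("div.entry-meta span.posted-on")
    tags = soup.select("span.cat-links")
    # pair up the per-article fields row by row
    for d, h, t, c in zip(dates, headlines, tags, corpora):
        rows.append((d.text, h.text, t.text, c.text.strip()))

df = pd.DataFrame(rows, columns=["Date", "Headlines", "Tags", "Corpus"])
df.to_csv("results.csv", index=False)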
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd


def main(req, num):
    r = req.get("https://www.imolaoggi.it/category/cron/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    goal = [(x.time.text, x.h3.a.text, x.select_one("span.cat-links").get_text(strip=True), x.p.get_text(strip=True))
            for x in soup.select("div.entry-content")]
    return goal


with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        fs = [executor.submit(main, req, num) for num in range(1, 2937)]
        allin = []
        for f in fs:
            allin.extend(f.result())
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Tags", "Content"])
        print(df)
        df.to_csv("result.csv", index=False)