How to create dictionary from a table using beautifulsoup? - python

I am trying to retrieve data from a table via beautifulsoup, but somehow my (beginner) syntax is wrong:
from bs4 import BeautifulSoup
import requests
main_url = "https://www.apo-in.de/product/acc-akut-600-brausetabletten.24170.html"
req = requests.get(main_url)
soup = BeautifulSoup(req.text, "html.parser")
title = soup.find("div", id = "accordionContent5e95581b6e244")
results = {}
for row in title.findAll('tr'):
aux = row.findAll('td')
results[aux[0].string] = aux[1].string
print(results)
This is the relevant code:
<div id="accordionContent5e95581b6e244" class="panel-collapse collapse in">
<div class="panel-body">
<table class="table" width="100%">
<tbody>
<tr>
<th width="170">PZN</th>
<td>00520917</td>
</tr>
<tr>
<th width="170">Anbieter</th>
<td>Hexal AG</td>
</tr>
My goal is to retrieve a dictionary from the th td cells.
How can this be done in beautifulsoup?

I would suggest use pandas to store data in Data Frame and then import into dictionary.
import pandas as pd
from bs4 import BeautifulSoup
import requests
main_url = "https://www.apo-in.de/product/acc-akut-600-brausetabletten.24170.html"
req = requests.get(main_url)
soup = BeautifulSoup(req.text, "html.parser")
table=soup.select_one(".panel-body >table")
df=pd.read_html(str(table))[0]
print(df.set_index(0).to_dict('dict'))
Output:
{1: {'Rezeptpflichtig': 'nein', 'Anbieter': 'Hexal AG', 'PZN': '00520917', 'Darreichungsform': 'Brausetabletten', 'Wirksubstanz': 'Acetylcystein', 'Monopräparat': 'ja', 'Packungsgröße': '40\xa0St', 'Apothekenpflichtig': 'ja', 'Produktname': 'ACC akut 600mg Hustenlöser'}}

First Mistake : You are using id which varies of you want to scrape more pages .
Second Mistake : aux = row.findAll('td') this will return list of one item because you are not taking into consideration the th tags that means aux[1].string will raise an exception .
Here is the code :
from bs4 import BeautifulSoup
import requests
main_url = "https://www.apo-in.de/product/acc-akut-600-brausetabletten.24170.html"
req = requests.get(main_url)
soup = BeautifulSoup(req.text, "html.parser")
title = soup.find("div", class_="panel-collapse collapse in")
results = {}
for row in title.findAll('tr'):
key = row.find('th')
value = row.find('td')
results[key.text] =value.text.strip()
print(results)
Output:
{'PZN': '00520917', 'Anbieter': 'Hexal AG', 'Packungsgröße': '40\xa0St', 'Produktname': 'ACC akut 600mg Hustenlöser', 'Darreichungsform': 'Brausetabletten', 'Monopräparat': 'ja', 'Wirksubstanz': 'Acetylcystein', 'Rezeptpflichtig': 'nein', 'Apothekenpflichtig': 'ja'}

Related

Retrieving td value from tr that has certain other td value

Need to get the links from a td in rows that has a certain td value.
this is a tr in the table and I want to get the link from the div "Match" if the div "Home team" is of a certain value. There are many rows and I want to find every link that is matching. I have tried this and every time I only get the first row of the table. Here is the link https://wp.nif.no/PageTournamentDetailWithMatches.aspx?tournamentId=403373&seasonId=200937&number=all . Note that I translated some of the values to English in the examples below
homegames = browser.find_elements_by_xpath('//div[#data-title = "Home team"]/a[text()="Cleveland"]//parent::div//parent::td//parent::tr')
for link in homegames:
print(link.find_element_by_xpath('//td[3]/div/a').get_attribute('href'))
<td><div data-title="Date">23.10.2021</div></td>
<td><div data-title="Tid">16:15</div></td>
<td>div data-title="Matchnr">
2121503051
</div>
</td><td><div data-title="Home team">Cleveland</div></td>
<td><div data-title="Away team">
Ohio Travellers</div></td>
<td><div data-title="Court">F21</div></td><td><div data-title="Result">71 - 64</div></td>
<td><div data-title="Referee">John Doe<br>Will Smith<br></div></td></tr>```
The data is within the html source (so no need to use Selenium). But regardless of using Selenium or not, what you can do here is let BeautifulSoup find the specific tags you are after.
Without Selenium, it requires a little manipulation as decode the html.
import requests
from bs4 import BeautifulSoup
import json
import html
keyword = 'Askim'
url = 'https://wp.nif.no/PageTournamentDetailWithMatches.aspx?tournamentId=403373&seasonId=200937&number=all'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
jsonStr = soup.find('div', {'class':'xwp_table_bg'}).find_next('input')['value']
jsonData = json.loads(jsonStr)
links_list = []
for each in jsonData['data']:
#each = jsonData['data'][6]
htmlStr = ''.join(each)
htmlStr = html.unescape(htmlStr)
soup = BeautifulSoup(htmlStr, 'html.parser')
if soup.find('div', {'data-title':'Hjemmelag'}, text=keyword):
link = soup.find('div', {'data-title':'Kampnr'}).find('a')['href']
links_list.append(link)

Cannot get text of a span attribute using BeautifulSoup

I am trying to get from the following
<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>
the value of data-nodeid
I did the following
price_nodes = soup.find('span', attrs={'id': 'SkuNumber'})
datanode = price_nodes.select_one('span[data-nodeid]')
But I get "None"
How can I fix this? thank you
If price_nodes is correctly fill
i.e. price_nodes =
<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span>
You just have to do this:
datanode = price_nodes.get('data-nodeid')
Full code should be:
from bs4 import BeautifulSoup as soup
html = '<div><span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>'
page = soup(html, 'html.parser')
price_nodes = page.find('span', {'id': 'SkuNumber'})
datanode = price_nodes.get('data-nodeid')
from bs4 import BeautifulSoup
html = '<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>'
soup = BeautifulSoup(html)
price_nodes = soup.find('span', attrs={'id': 'SkuNumber'})
print(price_nodes['data-nodeid'])

Beautiful Soup - find a tag within a tag with string? nth-child?

I'm having some trouble with the following HTML Scrape
res = <div class="gunDetails">
<h4>Specifications</h4>
<ul class="features">
<li><label>Make:</label><span itemprop="brand">Gamo</span></li>
<li><label>Model:</label><span itemprop="model">Coyote Black Tactical</span></li>
<li><label>Licence:</label><span>No Licence</span></li>
<li><label>Orient.:</label><span>Ambidextrous</span></li>
<li><label>Scope:</label><span>Unknown 3-9x32</span></li>
<li><label>Origin:</label><span>Spanish</span></li>
<li><label>Cased:</label><span>Other</span></li>
<li><label>Trigger:</label><span>1</span></li>
<li><label>Condition:</label><span itemprop="itemCondition">Used</span></li>
</ul>
</div>
I'm trying to get the text into its own seperate variable so i can export it to a CSV with my own headers.
Any time i do it i can get all of them in a string, or none at all.
soup = BeautifulSoup(res, 'html.parser')
gun_details = soup.select_one('div.gunDetails')
for tag in gun_details or []:
for tag in gun_details.select("li"):
for tag in gun_details.select('span'):
print(tag.text)
Output
Gamo
Coyote Black Tactical
No License
Ambidextrous
Unknown 3-9x32
Spanish
Other
1
Used
Is there anyway in which i could create a variable for each label text?
something like?
gun_make = gun_details.findAll('label', String="Make:")
print(gun_make).text
This is the Full Code:
from bs4 import BeautifulSoup
import requests
import csv
all_links=[]
labels = []
spans = []
url="https://www.guntrader.uk/dealers/redcar/spencers-sporting-guns/guns?page={}"
for page in range(1,3):
res=requests.get(url.format(page)).text
soup=BeautifulSoup(res,'html.parser')
for link in soup.select('a[href*="/dealers/redcar/spencers-sporting-guns/guns/shotguns"]'):
all_links.append("https://www.guntrader.uk" + link['href'])
print(len(all_links))
for a_link in all_links:
res = requests.get(a_link).text
soup = BeautifulSoup(res, 'html.parser')
gun_details = soup.select('div.gunDetails')
for l in gun_details.select('label'):
labels.append(l.text.replace(':',''))
for s in gun_details.select('span'):
spans.append(s.text)
my_dict = dict(zip(labels, spans))
with open('gundealer.csv','w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in mydict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")
This section seems to work on its own OK giving the correct(ish) output:
from bs4 import BeautifulSoup
import requests
response = request.get('guntrader.uk/guns/air-rifles/gamo/pre-charged-pneumatic/22/coyote-black-tactical-190920182249936')
soup = BeautifulSoup(response.content, 'html.parser')
gun_details = soup.select_one('div.gunDetails')
gun_make = gun_details.select_one('li:nth-child(1)')
print (gun_make.text)
With the output:
Make: Gamo
But I dont know what im doing to mess up the initial response from the loop to not allow the above snippet not to work
Let's try this:
res = """ <div class="gunDetails">
<h4>Specifications</h4>
<ul class="features">
<li><label>Make:</label><span itemprop="brand">Gamo</span></li>
<li><label>Model:</label><span itemprop="model">Coyote Black Tactical</span></li>
<li><label>Licence:</label><span>No Licence</span></li>
<li><label>Orient.:</label><span>Ambidextrous</span></li>
<li><label>Scope:</label><span>Unknown 3-9x32</span></li>
<li><label>Origin:</label><span>Spanish</span></li>
<li><label>Cased:</label><span>Other</span></li>
<li><label>Trigger:</label><span>1</span></li>
<li><label>Condition:</label><span itemprop="itemCondition">Used</span></li>
</ul>
</div>
""
from bs4 import BeautifulSoup as bs
import csv
labels = []
spans = []
soup = bs(res, 'html.parser')
gun_details = soup.select_one('div.gunDetails')
for l in gun_details.select('label'):
labels.append(l.text.replace(':',''))
for s in gun_details.select('span'):
spans.append(s.text)
my_dict = dict(zip(labels, spans))
with open('mycsvfile.csv','w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in my_dict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")
Output:
Make Gamo
Model Coyote Black Tactical
Licence No Licence
Orient. Ambidextrous
Scope Unknown 3-9x32
Origin Spanish
Cased Other
Trigger 1
Condition Used

Scraping multiple data tags from HTML using beautiful Soup

I am attempting to scrape HTML to create a dictionary that includes a pitchers name and his handed-ness. The data-tags are buried--so far I've only been able to collect the pitchers name from the data set. The HTML output (for each player) is as follows:
<div class="pitcher players">
<input name="import-data" type="hidden" value="%5B%7B%22slate_id%22%3A20190%2C%22type%22%3A%22classic%22%2C%22player_id%22%3A%2210893103%22%2C%22salary%22%3A%2211800%22%2C%22position%22%3A%22SP%22%2C%22fpts%22%3A14.96%7D%2C%7B%22slate_id%22%3A20192%2C%22type%22%3A%22classic%22%2C%22player_id%22%3A%2210894893%22%2C%22salary%22%3A%2211800%22%2C%22position%22%3A%22SP%22%2C%22fpts%22%3A14.96%7D%2C%7B%22slate_id%22%3A20193%2C%22type%22%3A%22classic%22%2C%22player_id%22%3A%2210895115%22%2C%22salary%22%3A%2211800%22%2C%22position%22%3A%22SP%22%2C%22fpts%22%3A14.96%7D%5D"/>
<a class="player-popup" data-url="https://rotogrinders.com/players/johnny-cueto-11193?site=draftkings" href="https://rotogrinders.com/players/johnny-cueto-11193">Johnny Cueto</a>
<span class="meta stats">
<span class="stats">
R
</span>
<span class="salary" data-role="salary" data-salary="$11.8K">
$11.8K
</span>
<span class="fpts" data-fpts="14.96" data-product="56" data-role="authorize" title="Projected Points">14.96</span>
I've tinkered and and coming up empty--I'm sure I'm overthinking this. Here is the code I have so far:
import requests
from bs4 import BeautifulSoup
url = "https://rotogrinders.com/lineups/mlb?site=draftkings"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
players_confirmed = {}
results = [soup.find_all("div", {'class':'pitcher players'}]
What's the best way to loop through the results set for the more granular data tag information I need?
I need the text from the HTML beginning with , and handed-ness from the tag
Optimally, I would have a dictionary with the following:
{Johnny Cueto : R, Player 2 : L, ...}
import requests
from bs4 import BeautifulSoup
url = "https://rotogrinders.com/lineups/mlb?site=draftkings"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
players_confirmed = {}
results = soup.find_all("div", {'class': 'pitcher players'})
dicti={}
for j in results:
dicti[j.a.text]=j.select(".stats")[1].text.strip("\n").strip()
just use select or find function of the founded element,and you will be able to iterate

how to preserve links when scraping a table with beautiful soup and pandas

Scraping a web to get a table, using Beautiful soup and Pandas. One of the columns got some urls. When I pass html to pandas, href are lost.
is there any way of preserving the url link just for that column?
Example data (edited for better suit ral case):
<html>
<body>
<table>
<tr>
<td>customer</td>
<td>country</td>
<td>area</td>
<td>website link</td>
</tr>
<tr>
<td>IBM</td>
<td>USA</td>
<td>EMEA</td>
<td>IBM site</td>
</tr>
<tr>
<td>CISCO</td>
<td>USA</td>
<td>EMEA</td>
<td>cisco site</td>
</tr>
<tr>
<td>unknown company</td>
<td>USA</td>
<td>EMEA</td>
<td></td>
</tr>
</table>
</body>
</html>
My python code:
file = open(url,"r")
soup = BeautifulSoup(file, 'lxml')
parsed_table = soup.find_all('table')[1]
df = pd.read_html(str(parsed_table),encoding='utf-8')[0]
df
Output (exported to CSV):
customer;country;area;website
IBM;USA;EMEA;IBM site
CISCO;USA;EMEA;cisco site
unknown company;USA;EMEA;
df output is ok but the link is lost. I need to preserve the link. The URL at least.
any hint?
pd.read_html assumes the data you are interested in is in the text, not the tag attributes. However, it isn't hard to scrape the table yourself:
import bs4 as bs
import pandas as pd
with open(url,"r") as f:
soup = bs.BeautifulSoup(f, 'lxml')
parsed_table = soup.find_all('table')[1]
data = [[td.a['href'] if td.find('a') else
''.join(td.stripped_strings)
for td in row.find_all('td')]
for row in parsed_table.find_all('tr')]
df = pd.DataFrame(data[1:], columns=data[0])
print(df)
yields
customer country area website link
0 IBM USA EMEA http://www.ibm.com
1 CISCO USA EMEA http://www.cisco.com
2 unknown company USA EMEA
Just check if tag exists this way:
import numpy as np
with open(url,"r") as f:
sp = bs.BeautifulSoup(f, 'lxml')
tb = sp.find_all('table')[56]
df = pd.read_html(str(tb),encoding='utf-8', header=0)[0]
df['href'] = [np.where(tag.has_attr('href'),tag.get('href'),"no link") for tag in tb.find_all('a')]
Here is another way to do it if you have more than one link to grab from html table. Instead of making list comprehension I would rather go with separeated for loops so code is more readable to those who are new to python and it is easier to adjust code ore handle errors if they emerge. I hope it will help someone.
soup = BeautifulSoup(html, "lxml")
table = table.find('table')
thead = table.find('thead')
column_names = [th.text.strip() for th in thead.find_all('th')]
data = []
for row in table.find_all('tr'):
row_data = []
for td in row.find_all('td'):
td_check = td.find('a')
if td_check is not None:
link = td.a['href']
row_data.append(link)
else:
not_link = ''.join(td.stripped_strings)
if not_link == '':
not_link = None
row_data.append(not_link)
data.append(row_data)
df = pd.DataFrame(data[1:], columns=column_names)
df_dict = df.to_dict('records')
for row in df_dict:
print(row)

Categories