bs4 remove None tag after decompose - python

I want to delete the advertisement text from scraped data, but after I decompose it I get an error saying
list index out of range
I think it's because a blank space or something is left behind after decompose. Without decompose the loop works OK.
import requests
from bs4 import BeautifulSoup
# Script as asked in the question: the row list is built first and the
# advertisement row is destroyed afterwards, so the list still references it.
url = 'https://www.marketbeat.com/insider-trades/ceo-share-buys-and-sales/'
companyName = 'title-area'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
first_table = soup.find_all('table')[0]
table = first_table.tbody.find_all('tr')
# delete advertisement
ad_row = soup.find("tr", class_="bottom-sort")
ad_row.decompose()
for el in table:
    cells = el.find_all('td')
    print(cells[0].text)

You can use tag.extract() to delete the tag. Also, delete the tag before you find all <tr> tags:
import requests
from bs4 import BeautifulSoup
url = "https://www.marketbeat.com/insider-trades/ceo-share-buys-and-sales/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Detach every advertisement row *before* collecting the rows, so the list
# built below never contains a removed tag.
for ad_row in soup.select("tr.bottom-sort"):
    ad_row.extract()

table = soup.find_all("table")[0].tbody.find_all("tr")
for row in table:
    first_cell = row.find_all("td")[0]
    print(first_cell.text)
Prints:
...
TZOOTravelzoo
NEOGNeogen Co.
RKTRocket Companies, Inc.
FINWFinWise Bancorp
WMPNWilliam Penn Bancorporation

There is nothing wrong with using decompose(); you only have to pay attention to the order of the steps in your process:
# First delete the advertisement row. find() returns None when no such row
# exists, so guard the call instead of crashing with AttributeError.
ad_row = soup.find("tr", class_="bottom-sort")
if ad_row is not None:
    ad_row.decompose()
# Then select the table rows, so the list never holds the destroyed tag.
table = soup.find_all('table')[0].tbody.find_all('tr')

Related

Retrieving td value from tr that has certain other td value

Need to get the links from a td in rows that has a certain td value.
this is a tr in the table and I want to get the link from the div "Match" if the div "Home team" is of a certain value. There are many rows and I want to find every link that is matching. I have tried this and every time I only get the first row of the table. Here is the link https://wp.nif.no/PageTournamentDetailWithMatches.aspx?tournamentId=403373&seasonId=200937&number=all . Note that I translated some of the values to English in the examples below
# Select every <tr> whose "Home team" cell links to the wanted team.
# XPath attribute tests are written with "@"; "#data-title" is not valid XPath.
homegames = browser.find_elements_by_xpath('//div[@data-title = "Home team"]/a[text()="Cleveland"]//parent::div//parent::td//parent::tr')
for link in homegames:
    # NOTE: '//td[3]/div/a' is evaluated from the document root, not from
    # `link`, which matches the reported symptom of always getting the first
    # row; './/td[3]/div/a' would be evaluated relative to the current row.
    print(link.find_element_by_xpath('//td[3]/div/a').get_attribute('href'))
<td><div data-title="Date">23.10.2021</div></td>
<td><div data-title="Tid">16:15</div></td>
<td><div data-title="Matchnr">
2121503051
</div>
</td><td><div data-title="Home team">Cleveland</div></td>
<td><div data-title="Away team">
Ohio Travellers</div></td>
<td><div data-title="Court">F21</div></td><td><div data-title="Result">71 - 64</div></td>
<td><div data-title="Referee">John Doe<br>Will Smith<br></div></td></tr>
The data is within the html source (so no need to use Selenium). But regardless of using Selenium or not, what you can do here is let BeautifulSoup find the specific tags you are after.
Without Selenium, it requires a little manipulation, such as decoding the HTML.
import requests
from bs4 import BeautifulSoup
import json
import html
keyword = 'Askim'
url = 'https://wp.nif.no/PageTournamentDetailWithMatches.aspx?tournamentId=403373&seasonId=200937&number=all'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# The table data is read as a JSON string from the `value` attribute of the
# first <input> that follows the 'xwp_table_bg' <div>.
jsonStr = soup.find('div', {'class': 'xwp_table_bg'}).find_next('input')['value']
jsonData = json.loads(jsonStr)

links_list = []
for each in jsonData['data']:
    # Presumably each entry is a list of HTML-escaped cell fragments: join
    # them into one row, unescape the entities and parse the row on its own.
    row_html = html.unescape(''.join(each))
    # Use a separate name so the page-level `soup` is not overwritten.
    row_soup = BeautifulSoup(row_html, 'html.parser')
    # `string=` is the current name of the deprecated `text=` filter.
    if row_soup.find('div', {'data-title': 'Hjemmelag'}, string=keyword):
        match_cell = row_soup.find('div', {'data-title': 'Kampnr'})
        anchor = match_cell.find('a') if match_cell is not None else None
        # Skip rows whose match-number cell carries no link.
        if anchor is not None and anchor.has_attr('href'):
            links_list.append(anchor['href'])

Beautifulsoup scraping specific table in page with multiple tables

import requests
from bs4 import BeautifulSoup
# Download the page and print the text of every <tr> in the document,
# whichever table it belongs to.
page_url = "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
results = requests.get(page_url)
src = results.content
soup = BeautifulSoup(src, 'lxml')
trs = soup.find_all("tr")
for table_row in trs:
    print(table_row.text)
This is the code I write for the scraping table from the page "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
If I am only targeting the table in the section "List of most Olympic gold medals over career", how can I specify the table I need? There are 2 sortable jquery-tablesorter tables, so I cannot use the class attribute to select the table I need.
One more question, if I know that the page I am scraping contains a lot of tables and the one I need always have 10 td in 1 row, can I have something like
If len(td) == 10:
print(tr)
to extract the data I wanted
Update on code:
import requests  # was missing: requests.get() is called below
from bs4 import BeautifulSoup

results = requests.get("https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists")
src = results.content
soup = BeautifulSoup(src, 'lxml')
# find() returns only the first <tbody> in the document.
tbs = soup.find("tbody")
trs = tbs.find_all("tr")
for tr in trs:
    print(tr.text)
I have one solution, not a good one, that just extracts the first table from the page, which is the one I need. Any suggestions/improvements are welcome!
Thank you.
To only get the first table you can use a CSS Selector nth-of-type(1):
import requests
from bs4 import BeautifulSoup
URL = "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
# select_one() gives back the first element matching the CSS selector.
table = soup.select_one("table.wikitable:nth-of-type(1)")
for table_row in table.find_all("tr"):
    print(table_row.text)

Beautifulsoup check span class and rel

I want to check the span class (glyphicon icon icon-positive = True) and, if it is present, take the value from the tr rel attribute (/reestr/clients/233/members/3567150). How can I do this?
I don't understand how to access this data.
from bs4 import BeautifulSoup
import requests
url = 'http://reestr.nostroy.ru/reestr?m.fulldescription=&m.shortdescription=&m.inn=6674374250&m.ogrnip=&bms.id=&bmt.id=&u.registrationnumber='
html = requests.get(url)
page_bytes = html.content
soup = BeautifulSoup(page_bytes, 'html.parser')
news, new_news = [], []
# Collect every <table> carrying this class string (find_all is the
# snake_case spelling of findAll).
news = soup.find_all('table', class_='items table table-selectable-row table-striped')
Give this a go:
# Take the single results table, then print the `rel` attribute of every row
# that contains the "positive" status icon.
news = soup.find('table', class_='items table table-selectable-row table-striped')
for table_row in news.find_all('tr'):
    positive_icon = table_row.find('span', class_='glyphicon icon icon-positive')
    if positive_icon:
        print(table_row['rel'])
Note I changed the way you find news (with .find instead of .find_all) as there is only one object matching that condition.

What is the proper syntax for .find() in bs4?

I am trying to scrape the bitcoin price off of coinbase and cannot find the proper syntax. When I run the program (without the line with question marks) I get the block of html that I need, but I don't know how to narrow down and retrieve the price itself. Any help appreciated, thanks.
import requests
from bs4 import BeautifulSoup
url = 'https://www.coinbase.com/charts'
data = requests.get(url)
nicedata = data.text
soup = BeautifulSoup(nicedata, 'html.parser')
prettysoup = soup.prettify()
# A quoted string cannot span several source lines; adjacent literals are
# concatenated instead, giving one space-separated class string.
bitcoin = soup.find('h4', {'class': 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM '
                                    'TextElement__Spacer-sc-18l8wi5-0 hpeTzd'})
# '???' is the asker's placeholder for the lookup they are asking about.
price = bitcoin.find('???')
print(price)
The attached image contains the html
To get text from item:
price = bitcoin.text
But this page has many <h4> items with this class, and find() gets only the first one, which has the text Bitcoin, not the price from your image. You may need find_all() to get a list of all items; then you can use an index [index] or slicing [start:end] to get some items, or a for-loop to work with every item in the list.
import requests
from bs4 import BeautifulSoup
url = 'https://www.coinbase.com/charts'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Class string shared by the <h4> elements of interest.
h4_class = 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM TextElement__Spacer-sc-18l8wi5-0 hpeTzd'
all_h4 = soup.find_all('h4', {'class': h4_class})
for heading in all_h4:
    print(heading.text)
It can be easier to work with data if you keep it in list of list or array or DataFrame. But to create list of lists it would be easier to find rows <tr> and inside every row search <h4>
import requests
from bs4 import BeautifulSoup
url = 'https://www.coinbase.com/charts'
# `headers` was passed to requests.get() without being defined (NameError);
# define a browser-like User-Agent explicitly.
headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
all_tr = soup.find_all('tr')

# One inner list per table row, holding the text of each <h4> in that row.
data = []
for tr in all_tr:
    row = [h4.text for h4 in tr.find_all('h4')]
    if row:  # skip rows that contain no <h4>
        data.append(row)

for row in data:
    print(row)
It doesn't need class to get all h4.
BTW: This page uses JavaScript to append new rows when you scroll page but requests and BeautifulSoup can't run JavaScript - so if you will need all rows then you may need Selenium to control web browser which runs JavaScript

Using the BeautifulSoup find method to obtain data from a table row

I am writing a Python script using BeautifulSoup to scrape values from this webpage: https://uk-air.defra.gov.uk/latest/currentlevels
I want to use soup.find() to get values for "Hourly mean Nitrogen dioxide" and "Last updated" from the table row where the "Monitoring site" is "Edinburgh St Leonards".
As I am new to web scraping I am having a bit of trouble so would be grateful for any help on this.
Scrape all the HTML tables into a list of tables.
The table index may change, so you should not rely on a fixed row/column index.
Part of the following script looks up the index of the searched data. Moreover, it prints the header name, so you know what data you are getting.
from bs4 import BeautifulSoup
import urllib.request
import re
with urllib.request.urlopen('https://uk-air.defra.gov.uk/latest/currentlevels?view=region') as response:
    htmlData = response.read()
soup = BeautifulSoup(htmlData, 'html5lib')
tables = soup.find_all('table', attrs={'class': 'current_levels_table'})

# What you want to check: patterns searched (case-insensitively) in the
# column headers, and the pattern searched in the first cell of each row.
Iwant = ['nitrogen', 'update']
about = 'Edinburgh'

for table in tables:
    # Column number -> header text for the wanted columns of this table.
    # (The mapping was never initialised before, which raised NameError.)
    index = {}
    table_head = table.find('thead')
    headrows = table_head.find_all('tr')
    # The second header row is the one read for the column names.
    measures = headrows[1].find_all('th')
    for colnum, measure in enumerate(measures):
        # Pass the flag as an argument: inline flags such as '(?iu)' placed at
        # the END of a pattern are rejected by current versions of `re`.
        if any(re.search(wanted, measure.text, re.IGNORECASE) for wanted in Iwant):
            index[colnum] = measure.text.strip()

    # Get the table content and look for the wanted site.
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cels = row.find_all('td')
        rowContent = [cel.text.strip().replace(u'\xa0', u' ').replace(u'\n Timeseries Graph', u'') for cel in cels if cel]
        if not rowContent:
            # Row without <td> cells: nothing to match against.
            continue
        if re.search(about, rowContent[0], re.IGNORECASE):
            for indexwanted, measurewanted in index.items():
                print(measurewanted, ':', rowContent[indexwanted])
Making use of the suggestion from d2718nis, you can do it in this way. Of course, many other ways would work too.
First, find the link that has the 'Edinburgh St Leonards' text in it. Then find the grandparent of that link element, which is a tr element. Now identify the td elements in the tr. When you examine the table you see that the columns you want are the 4th and 7th. Get those from all of the td elements as the (0-relative) 3rd and 6th. Finally, display the crude texts of these elements.
You will need to do something clever to extract properly readable strings from these results.
>>> import requests
>>> import bs4
>>> page = requests.get('https://uk-air.defra.gov.uk/latest/currentlevels', headers={'User-Agent': 'Not blank'}).content
>>> soup = bs4.BeautifulSoup(page, 'lxml')
>>> Edinburgh_link = soup.find_all('a',string='Edinburgh St Leonards')[0]
>>> Edinburgh_link
Edinburgh St Leonards
>>> Edinburgh_row = Edinburgh_link.findParent('td').findParent('tr')
>>> Edinburgh_columns = Edinburgh_row.findAll('td')
>>> Edinburgh_columns[3]
<td class="center"><span class="bg_low1 bold">20 (1 Low)</span></td>
>>> Edinburgh_columns[6]
<td>05/08/2017<br/>14:00:00</td>
>>> Edinburgh_columns[3].text
'20\xa0(1\xa0Low)'
>>> Edinburgh_columns[6].text
'05/08/201714:00:00'
you can start with this:
import requests
from bs4 import BeautifulSoup
# A non-blank User-Agent header is sent to prevent a 403 Forbidden response.
request_headers = {'User-Agent': 'Not blank'}
page = requests.get(url='https://uk-air.defra.gov.uk/latest/currentlevels', headers=request_headers)
# Markup of the page as text
html = page.text
# Parse the markup with the html5lib tree builder
soup = BeautifulSoup(html, 'html5lib')
# Print every <table> found on the page
all_tables = soup.find_all('table')
for table in all_tables:
    print(table)

Categories