Beautifulsoup check span class and rel - python

I want to check whether a span has the class glyphicon icon icon-positive and, if so, take the value of the tr's rel attribute (/reestr/clients/233/members/3567150). How can I do this?
I don't understand how to access this data.
from bs4 import BeautifulSoup
import requests
url = 'http://reestr.nostroy.ru/reestr?m.fulldescription=&m.shortdescription=&m.inn=6674374250&m.ogrnip=&bms.id=&bmt.id=&u.registrationnumber='
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = []
new_news = []
news = soup.findAll('table', class_='items table table-selectable-row table-striped')

Give this a go:
news = soup.find('table', class_='items table table-selectable-row table-striped')
for tr in news.find_all('tr'):
    if tr.find('span', class_='glyphicon icon icon-positive'):
        print(tr['rel'])
Note that I changed the way you find news (.find instead of .find_all), as there is only one table matching that condition.
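If some matching rows might not carry a rel attribute at all, a slightly more defensive variant (just a sketch of the same idea, using .get(), which returns None instead of raising a KeyError) would be:
news = soup.find('table', class_='items table table-selectable-row table-striped')
for tr in news.find_all('tr'):
    if tr.find('span', class_='glyphicon icon icon-positive'):
        rel = tr.get('rel')  # .get() avoids a KeyError when the attribute is missing
        if rel:
            print(rel)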

Related

bs4 remove None tag after decompose

I want to delete advertisement text from scraped data, but after I decompose it I get an error saying
list index out of range
I think it's because after decompose there is a blank space or something. Without the decompose, the loop works fine.
import requests
from bs4 import BeautifulSoup
url = 'https://www.marketbeat.com/insider-trades/ceo-share-buys-and-sales/'
companyName = 'title-area'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find_all('table')[0].tbody.find_all('tr')
# delete advertisement
soup.find("tr", class_="bottom-sort").decompose()
for el in table:
    print(el.find_all('td')[0].text)
You can use tag.extract() to delete the tag. Also, delete the tag before you find all <tr> tags:
import requests
from bs4 import BeautifulSoup
url = "https://www.marketbeat.com/insider-trades/ceo-share-buys-and-sales/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
# delete advertisement
for tr in soup.select("tr.bottom-sort"):
    tr.extract()
table = soup.find_all("table")[0].tbody.find_all("tr")
for el in table:
    print(el.find_all("td")[0].text)
Prints:
...
TZOOTravelzoo
NEOGNeogen Co.
RKTRocket Companies, Inc.
FINWFinWise Bancorp
WMPNWilliam Penn Bancorporation
There is nothing wrong with using decompose(); you only have to pay attention to the order of operations:
# first delete the advertisement
soup.find("tr", class_="bottom-sort").decompose()
# then select the table rows
table = soup.find_all('table')[0].tbody.find_all('tr')
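For context: the original error occurs because table already holds a reference to the ad row when decompose() empties it in place, so el.find_all('td') returns an empty list for that row and [0] raises IndexError. If you cannot control the order, a defensive variant (just a sketch) skips such rows:
for el in table:
    tds = el.find_all('td')
    if tds:  # the decomposed ad row has no <td> children left
        print(tds[0].text)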

How to list out all the h2, h3, and p tags then create a dataframe to store them

I was given a website to scrape all of the key items from.
But the output I got with BeautifulSoup4 is only for one item, so I wonder if I need to use something like soup.find_all to extract all the key items into a list from the website.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
url=
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
column= soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())
position = column.h2.text
company = column.h3.text
city_state= column.find_all('p')[-2].text
print (position, company, city_state)
Thank you.
Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]
city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class': 'location'}):
    city_state0.append(p.text.split(',')[0].strip())
    city_state1.append(p.text.split(',')[1].strip())
df = pd.DataFrame({
    'city_state1': city_state0,
    'city_state2': city_state1,
    'companies': companies,
    'positions': positions
})
print(df)
You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')
# loop over locations and extract the city and state
for location in locations:
    city = location.text.split(', ')[0]
    state = location.text.split(', ')[1]
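To go from those three lists to the DataFrame the question asks for, one option (a sketch, assuming the three lists line up one-to-one per job card on the page) is to zip them together:
import pandas as pd

# pair up the i-th title, company and location; strip surrounding whitespace
rows = [
    (t.text.strip(), c.text.strip(), loc.text.strip())
    for t, c, loc in zip(titles, companies, locations)
]
df = pd.DataFrame(rows, columns=['position', 'company', 'location'])
print(df)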

Beautifulsoup scraping specific table in page with multiple tables

import requests
from bs4 import BeautifulSoup
results = requests.get("https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists")
src = results.content
soup = BeautifulSoup(src, 'lxml')
trs = soup.find_all("tr")
for tr in trs:
    print(tr.text)
This is the code I wrote for scraping a table from the page "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists".
If I am only targeting the table in the section "List of most Olympic gold medals over career", how can I specify the table I need? There are two sortable jquery-tablesorter tables, so I cannot use the class attribute to select the table I need.
One more question: if I know that the page I am scraping contains a lot of tables and the one I need always has 10 td elements in one row, can I have something like
if len(td) == 10:
    print(tr)
to extract the data I want?
Update on code:
import requests
from bs4 import BeautifulSoup

results = requests.get("https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists")
src = results.content
soup = BeautifulSoup(src, 'lxml')
tbs = soup.find("tbody")
trs = tbs.find_all("tr")
for tr in trs:
    print(tr.text)
I have one solution, not a good one: it just extracts the first table from the page, which is the one I need. Any suggestions/improvements are welcome!
Thank you.
To get only the first table you can use the CSS selector nth-of-type(1):
import requests
from bs4 import BeautifulSoup
URL = "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
table = soup.select_one("table.wikitable:nth-of-type(1)")
trs = table.find_all("tr")
for tr in trs:
    print(tr.text)
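As for the second question: yes, filtering rows by their cell count works. A sketch of that idea (keeping only rows with exactly 10 <td> elements; header rows use <th> and so are skipped automatically):
for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    if len(tds) == 10:  # the question's premise: the target table has 10 cells per row
        print([td.get_text(strip=True) for td in tds])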

What is the proper syntax for .find() in bs4?

I am trying to scrape the bitcoin price off of Coinbase and cannot find the proper syntax. When I run the program (without the line with the question marks) I get the block of HTML that I need, but I don't know how to narrow it down and retrieve the price itself. Any help appreciated, thanks.
import requests
from bs4 import BeautifulSoup
url = 'https://www.coinbase.com/charts'
data = requests.get(url)
nicedata = data.text
soup = BeautifulSoup(nicedata, 'html.parser')
prettysoup = soup.prettify()
bitcoin = soup.find('h4', {'class': 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM TextElement__Spacer-sc-18l8wi5-0 hpeTzd'})
price = bitcoin.find('???')
print(price)
The attached image contains the HTML.
To get the text from the item:
price = bitcoin.text
But this page has many <h4> items with this class, and find() gets only the first one, which has the text Bitcoin, not the price from your image. You may need find_all() to get a list of all the items; then you can use an index [index] or slicing [start:end] to get some of them, or a for loop to work with every item in the list.
import requests
from bs4 import BeautifulSoup
url = 'https://www.coinbase.com/charts'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
all_h4 = soup.find_all('h4', {'class': 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM TextElement__Spacer-sc-18l8wi5-0 hpeTzd'})
for h4 in all_h4:
    print(h4.text)
It can be easier to work with the data if you keep it in a list of lists, an array, or a DataFrame. To create a list of lists, it is easier to find the rows <tr> and then search for <h4> inside every row:
import requests
from bs4 import BeautifulSoup

url = 'https://www.coinbase.com/charts'
headers = {'User-Agent': 'Mozilla/5.0'}  # identify as a regular browser
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
all_tr = soup.find_all('tr')
data = []
for tr in all_tr:
    row = []
    for h4 in tr.find_all('h4'):
        row.append(h4.text)
    if row:  # skip empty rows
        data.append(row)
for row in data:
    print(row)
This way you don't need the class to get all the <h4> elements.
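If you do want the DataFrame mentioned above, the collected rows convert directly. A minimal sketch, assuming pandas is installed (rows of uneven length are padded with NaN):
import pandas as pd

# each inner list becomes one row of the frame
df = pd.DataFrame(data)
print(df.head())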
BTW: this page uses JavaScript to append new rows when you scroll, but requests and BeautifulSoup can't run JavaScript, so if you need all the rows you may need Selenium to control a web browser, which does run JavaScript.

Adding objects for each item added from scraping data from a website

I am trying to retrieve data from a website and add an object for each row of data. I am new to Python and I am clearly missing something, because I get only one object. What I'm trying to get is all the objects, sorted by key-value pairs:
import urllib.request
import bs4 as bs
url = 'http://freemusicarchive.org/search/?quicksearch=drake/'
search = ''
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(html, 'html.parser')
tracks_info = [{}]
spans = soup.find_all('span', {'class': 'ptxt-artist'})
for span in spans:
    arts = span.find_all('a')
    for art in arts:
        print(art.text)
spans = soup.find_all('span', {'class': 'ptxt-track'})
for span in spans:
    tracks = span.find_all('a')
    for track in tracks:
        print(track.text)
for download_links in soup.find_all('a', {'title': 'Download'}):
    print(download_links.get('href'))
for info in tracks_info:
    info.update({'artist': art.text})
    info.update({'track': track.text})
    info.update({'link': download_links.get('href')})
    print(info)
I failed to add an object for each element I get from the website. I'm clearly doing something wrong (or not doing something), and any help would be much appreciated!
You could use a slightly different structure and syntax, such as below.
I use a contains CSS class selector to retrieve the rows of info, as the id is different for each track.
The CSS selector div[class*="play-item gcol gid-electronic tid-"] looks for div elements whose class attribute contains play-item gcol gid-electronic tid-.
Within each such div, the various columns of interest are then selected by their class names, and a descendant CSS selector is used for the a tag element of the final download link.
import urllib.request
import bs4 as bs
import pandas as pd
url = 'http://freemusicarchive.org/search/?quicksearch=drake/'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(html, 'html.parser')
tracks_Info = []
headRow = ['Artist','TrackName','DownloadLink']
for item in soup.select('div[class*="play-item gcol gid-electronic tid-"]'):
    tracks_Info.append([
        item.select_one(".ptxt-artist").text.strip(),
        item.select_one(".ptxt-track").text,
        item.select_one(".playicn a").get('href')
    ])
df = pd.DataFrame(tracks_Info,columns=headRow)
print(df)
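If you prefer the key-value pairs the question asked for over a DataFrame, the same loop can build a list of dicts instead (a sketch reusing the selectors from the answer above):
tracks = []
for item in soup.select('div[class*="play-item gcol gid-electronic tid-"]'):
    tracks.append({
        'artist': item.select_one('.ptxt-artist').text.strip(),
        'track': item.select_one('.ptxt-track').text.strip(),
        'link': item.select_one('.playicn a').get('href'),
    })
for track in tracks:
    print(track)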
