I'm trying to use BeautifulSoup to select the date, url, description, and additional url from table and am having trouble accessing them given the weird white spaces:
So far I've written:
import urllib
import urllib.request
from bs4 import BeautifulSoup
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
soup = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')
test1 = soup.findAll("td", {"nowrap" : "nowrap"})
test2 = [item.text.strip() for item in test1]
With bs4 4.7.1 you can use :has and nth-of-type in combination with next_sibling to get those columns
from bs4 import BeautifulSoup
import requests, re
def make_soup(url):
the_page = requests.get(url)
soup_data = BeautifulSoup(the_page.content, "html.parser")
return soup_data
soup = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')
releases = []
links = []
dates = []
descs = []
addit_urls = []
for i in soup.select('td:nth-of-type(1):has([href^="/litigation/litreleases/"])'):
sib_sib = i.next_sibling.next_sibling.next_sibling.next_sibling
releases+= [i.a.text]
links+= [i.a['href']]
dates += [i.next_sibling.next_sibling.text.strip()]
descs += [re.sub('\t+|\s+',' ',sib_sib.text.strip())]
addit_urls += ['N/A' if sib_sib.a is None else sib_sib.a['href']]
result = list(zip(releases, links, dates, descs, addit_urls))
print(result)
Unfortunately there is no class or id HTML attribute to quickly identify the table to scrape; after experimentation I found it was the table at index 4.
Next we ignore the header by separating it from the data, which still has table rows that are just separations for quarters. We can skip over these using a try-except block since those only contain one table data tag.
I noticed that the description is separated by tabs, so I split the text on \t.
For the urls, I used .get('href') rather than ['href'] since not every anchor tag has an href attribute from my experience scraping. This avoids errors should that case occur. Finally the second anchor tag does not always appear, so this is wrapped in a try-except block as well.
data = []
table = soup.find_all('table')[4] # target the specific table
header, *rows = table.find_all('tr')
for row in rows:
try:
litigation, date, complaint = row.find_all('td')
except ValueError:
continue # ignore quarter rows
id = litigation.text.strip().split('-')[-1]
date = date.text.strip()
desc = complaint.text.strip().split('\t')[0]
lit_url = litigation.find('a').get('href')
try:
comp_url = complaint.find('a').get('href')
except AttributeError:
comp_ulr = None # complaint url is optional
info = dict(id=id, date=date, desc=desc, lit_url=lit_url, comp_url=comp_url)
data.append(info)
Related
I'm performing some data analysis for my own knowledge from nhl spread/betting odds information. I'm able to pull some information, but Not the entire data set. I want to pull the list of games and the associated into a panda dataframe, but I have been able to perform the proper loop around the html tags. I've tried the findAll option and the xpath route. I'm not successful with either.
from bs4 import BeautifulSoup
import requests
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
name_box = page_content.find('div', attrs={'class': 'datarow'})
name = name_box.text.strip()
print (name)
This script goes through each datarow and pulls out each item individually and then appends them into a pandas DataFrame.
from bs4 import BeautifulSoup
import requests
import pandas as pd
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
tables = page_content.find_all('div', class_='datarow')
# Iterate through rows
rows = []
# Iterate through each datarow and pull out each home/away separately
for table in tables:
# Get time and date
time_and_date_tag = table.find_all('div', attrs={"class": "time"})[0].contents
date = time_and_date_tag[1]
time = time_and_date_tag[-1]
# Get teams
teams_tag = table.find_all('div', attrs={"class": "datacell teams"})[0].contents[-1].contents
home_team = teams_tag[1].text
away_team = teams_tag[-1].text
# Get opening
opening_tag = table.find_all('div', attrs={"class": "child-open"})[0].contents
home_open_value = opening_tag[1]
away_open_value = opening_tag[-1]
# Get current
current_tag = table.find_all('div', attrs={"class": "child-current"})[0].contents
home_current_value = current_tag[1]
away_current_value = current_tag[-1]
# Create list
rows.append([time, date, home_team, away_team,
home_open_value, away_open_value,
home_current_value, away_current_value])
columns = ['time', 'date', 'home_team', 'away_team',
'home_open', 'away_open',
'home_current', 'away_current']
print(pd.DataFrame(rows, columns=columns))
Here is my solution to your question.
from bs4 import BeautifulSoup
import requests
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
for cell in page_content.find_all('div', attrs={'class': 'datarow'}):
name = cell.text.strip()
print (name)
I am currently running the following python script:
import requests
from bs4 import BeautifulSoup
origin= ["USD","GBP","EUR"]
i=0
while i < len(origin):
page = requests.get("https://www.x-rates.com/table/?from="+origin[i]+"&amount=1")
soup = BeautifulSoup(page.content, "html.parser")
tables = soup.findChildren('table')
my_table = tables[0]
rows = my_table.findChildren(['td'])
i = i +1
for rows in rows:
cells = rows.findChildren('a')
for cell in cells:
value = cell.string
print(value)
To scrape data from this HTML:
https://i.stack.imgur.com/DkX83.png
The problem I have is that I'm struggling to only scrape the first column without scraping the second one as well because they are both under tags and in the same table row as each other. The href is the only thing which differentiates between the two tags and I have tried filtering using this but it doesn't seem to work and returns a blank value. Also when i try to sort the data manually the output is amended vertically and not horizontally, I am new to coding so any help would be appreciated :)
There is another way you might wanna try as well to achieve the same:
import requests
from bs4 import BeautifulSoup
keywords = ["USD","GBP","EUR"]
for keyword in keywords:
page = requests.get("https://www.x-rates.com/table/?from={}&amount=1".format(keyword))
soup = BeautifulSoup(page.content, "html.parser")
for items in soup.select_one(".ratesTable tbody").find_all("tr"):
data = [item.text for item in items.find_all("td")[1:2]]
print(data)
It is easier to follow what happens when you print every item you got from the top e.g. in this case from table item. The idea is to go one by one so you can follow.
import requests
from bs4 import BeautifulSoup
origin= ["USD","GBP","EUR"]
i=0
while i < len(origin):
page = requests.get("https://www.x-rates.com/table/?from="+origin[i]+"&amount=1")
soup = BeautifulSoup(page.content, "html.parser")
tables = soup.findChildren('table')
my_table = tables[0]
i = i +1
rows = my_table.findChildren('tr')
for row in rows:
cells = row.findAll('td',class_='rtRates')
if len(cells) > 0:
first_item = cells[0].find('a')
value = first_item.string
print(value)
cannot get the span text within the "table", thanks !
from bs4 import BeautifulSoup
import urllib2
url1 = "url"
content1 = urllib2.urlopen(url1).read()
soup = BeautifulSoup(content1,"lxml")
table = soup.findAll("div", {"class" : "iw_component","id":"c1417094965154"})
rows = table.find_all('span',recursive=False)
for row in rows:
print(row.text)
table = soup.findAll("div", {"class" : "iw_component","id":"c1417094965154"})
In the above line, findAll() returns a list.
So, in the next line you are getting the error because its expecting an HTML string.
If you expect only one table, try using the following code. Just replace
rows = table.find_all('span',recursive=False)
with
rows = table[0].find_all('span')
If you expect multiple tables in the page, run a for loop on the table and then run the rest of the statements inside the for loop.
Also, for pretty output, you can replace the tabs with spaces as in the following code:
row = row.get_text()
row = row.replace('\t', '')
print(row)
The final working code for you is:
from bs4 import BeautifulSoup
import urllib2
url1 = "url"
content1 = urllib2.urlopen(url1).read()
soup = BeautifulSoup(content1,"lxml")
table = soup.findAll("div", {"class" : "iw_component","id":"c1417094965154"})
rows = table[0].find_all('span')
for row in rows:
row_str = row.get_text()
row_str = row_str.replace('\t', '')
print(row_str)
Regarding recursive=False parameter, if it's set to false, it will only find in direct children which, in your case will give no result.
Recursive Argument in find()
If you only want Beautiful Soup to consider direct children, you can pass in recursive=False
Here's another approach using lxml instead of beautifulsoup:
import requests
from lxml import html
req = requests.get("<URL>")
raw_html = html.fromstring(req.text)
spans = raw_html.xpath('//div[#id="c1417094965154"]//span/text()')
print("".join([x.replace("\t", "").replace("\r\n","").strip() for x in spans]))
Output: Kranji Mile Day simulcast races, Kranji Racecourse, SINClass 3 Handicap - 1200M TURFSaturday, 26 May 2018Race 1, 5:15 PM
As you see, the output need a little formatting, spans is a list of all spans text, so you can do any processing you need.
You seem to use python 2.x, here is a python 3.x solution, since I do not have a python 2.x environment at the moment :
from bs4 import BeautifulSoup
import urllib.request as urllib
url1 = "<URL>"
# Read the HTML page
content1 = urllib.urlopen(url1).read()
soup = BeautifulSoup(content1, "lxml")
# Find the div (there is only one, so you do not need findAll) -> this is your problem
div = soup.find("div", class_="iw_component", id="c1417094965154")
# Now you retrieve all the span within this div
rows = div.find_all("span")
# You can do what you want with it !
line = ""
for row in rows:
row_str = row.get_text()
row_str = row_str.replace('\t', '')
line += row_str + ", "
print(line)
I want to build a list with coins from coinmarketcap.com.
Every element should be a tuple.
Something like:
coins = [('btc',8500,'+0.5%','+1.2%', '-1%'), ...]
I can't get percentage:
The information is in td like this:
<td class="no-wrap percent-change text-right positive_change" data-timespan="1h" data-percentusd="0.99" data-symbol="BTC" data-sort="0.991515">0.99%</td>
How can I access 0.99% value from above? I need in fact data-percentageusd from td but I don't know what that is.
My testing script is something like:
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/all/views/all/'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
name = soup.find_all('a', class_='currency-name-container')
price = soup.find_all('a', class_='price')
print(name)
print(price)
#how can percentage modification for 1h, 24h, 7d?
#delta_h = soup.find_all('td', ???)
You can loop over the rows of the table to get the data for each currency and store it in a tuple, and then add it to the list.
r = requests.get('https://coinmarketcap.com/all/views/all/')
soup = BeautifulSoup(r.text, 'lxml')
data = []
table = soup.find('table', id='currencies-all')
for row in table.find_all('tr'):
try:
symbol = row.find('td', class_='text-left col-symbol').text
price = row.find('a', class_='price').text
time_1h = row.find('td', {'data-timespan': '1h'}).text
time_24h = row.find('td', {'data-timespan': '24h'}).text
time_7d = row.find('td', {'data-timespan': '7d'}).text
except AttributeError:
continue
data.append((symbol, price, time_1h, time_24h, time_7d))
for item in data:
print(item)
Partial Output:
('BTC', '$8805.46', '0.88%', '-12.30%', '-19.95%')
('ETH', '$677.45', '0.98%', '-11.54%', '-21.66%')
('XRP', '$0.780113', '0.62%', '-10.63%', '-14.42%')
('BCH', '$970.70', '1.01%', '-11.33%', '-23.89%')
('LTC', '$166.70', '0.74%', '-10.06%', '-19.56%')
('NEO', '$83.55', '0.24%', '-16.29%', '-33.39%')
('XLM', '$0.286741', '1.13%', '-13.23%', '-11.84%')
('ADA', '$0.200449', '0.63%', '-16.92%', '-31.43%')
('XMR', '$256.92', '0.63%', '-19.98%', '-19.46%')
Since the data is missing for some currencies in the table, the code will raise an AttributeError for .text. To skip those currencies, I've used the try-except.
I am attempting a simple scrape of an HTML table using BeautifulSoup with the following:
import urllib
import urllib.request
from bs4 import BeautifulSoup
def make_soup(url):
page = urllib.request.urlopen(url)
sdata = BeautifulSoup(page, 'html.parser')
return sdata
url = 'http://www.satp.org/satporgtp/countries/pakistan/database/bombblast.htm'
soup = make_soup(url)
table = soup.findAll('table', attrs={'class':'pagraph1'})
table = table[0]
trows = table.findAll('tr')
bbdata_ = []
bbdata = []
for trow in trows:
bbdata_ = trow.findAll('td')
bbdata = [ele.text.strip() for ele in bbdata_]
print(bbdata)
However, I can only extract the last row in the table, i.e.
['Total*', '369', '1032+']
All of the data is included in the trows, so I must be forming my loop incorrectly, but I am not sure how.
Your problem is here:
bbdata = [ele.text.strip() for ele in bbdata_]
You want to append to the list or extend it:
bbdata.append([ele.text.strip() for ele in bbdata_])
You are overwriting bbdata each time through the loop which is why it ends up only with the final value.