Cannot scrape web with many tables with python lxml - python

I am trying to scrape this website, but I am not getting any results. This works with other pages in which there's only one simple table. Can you help me with the code?
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
import urllib
def scrape_table(url):
    """Download *url* and return every <tr> element found on the page."""
    # Fetch the page that we're going to parse.
    response = requests.get(url)
    root = html.fromstring(response.content)
    # Using XPath, grab all table-row elements anywhere in the document.
    # (An earlier, commented-out attempt targeted a specific table; note that
    # XPath attribute tests use '@', e.g. //div[@id="style-1"]/table[@class="table"].)
    #assert len(table) == 1
    #df = pd.read_html(lxml.etree.tostring(table[0], method='html'))[0]
    rows = root.xpath('//tr')
    return rows
# Ticker whose P/E history we want to pull.
symbol = 'AMZN'
#balance_sheet_url = 'https://finance.yahoo.com/quote/' + symbol + '?p=' + symbol
#df_balance_sheet = scrape_table(balance_sheet_url)
#df_balance_sheet.info()
#print(df_balance_sheet)
url = "https://www.macrotrends.net/stocks/charts/" + symbol + "/pe-ratio"
# Issue the request and keep the final (possibly redirected) URL.
data = requests.request("GET", url)
url_completo = data.url
print(url_completo)
df_pe = scrape_table(url_completo)
Here is the website I am trying to scrape (relevant HTML below): https://www.macrotrends.net/stocks/charts/TMO/thermo-fisher-scientific/pe-ratio
<div id="style-1" style="background-color:#fff; height: 500px; overflow:auto; margin: 0px 0px 30px 0px; padding:0px 30px 20px 0px; border:1px solid #dfdfdf;">
<table class="table">
<thead>
<tr>
<th colspan="4" style="text-align:center;">Thermo Fisher Scientific PE Ratio Historical Data</th>
</tr>
</thead>
<thead>
<tr>
<th style="text-align:center;">Date</th>
<th style="text-align:center;">Stock Price</th>
<th style="text-align:center;">TTM Net EPS</th>
<th style="text-align:center;">PE Ratio</th>
</tr>
</thead>
<tbody><tr>
<td style="text-align:center;">2019-04-12</td>
<td style="text-align:center;">280.65</td>
<td style="text-align:center;"></td>
<td style="text-align:center;">38.71</td>
</tr><tr>
<td style="text-align:center;">2018-12-31</td>
<td style="text-align:center;">223.79</td>
<td style="text-align:center;">$7.25</td>
<td style="text-align:center;">30.87</td>
</tr><tr>
<td style="text-align:center;">2018-09-30</td>
<td style="text-align:center;">243.90</td>
<td style="text-align:center;">$6.33</td>
<td style="text-align:center;">38.53</td>
</tr><tr>
<td style="text-align:center;">2018-06-30</td>
<td style="text-align:center;">206.84</td>
<td style="text-align:center;">$5.92</td>
<td style="text-align:center;">34.94</td>
</tr>
</table>
</div>

You have not built your URLs correctly. This code will fetch two tables: one for Amazon, then the next for Thermo Fisher Scientific.
import lxml
from lxml import html
import requests
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
def scrape_table(url):
    """Fetch *url* and return its first HTML table as a pandas DataFrame."""
    # Fetch the page that we're going to parse.
    response = requests.get(url)
    root = html.fromstring(response.content)
    # Locate every <table> element, then let pandas parse the first one.
    tables = root.findall('.//*/table')
    first_table_markup = lxml.etree.tostring(tables[0], method='html')
    return pd.read_html(first_table_markup)[0]
# Build the canonical macrotrends URL per ticker path and print each table.
for symbol in ['AMZN/amazon', 'TMO/thermo-fisher-scientific']:
    url = "https://www.macrotrends.net/stocks/charts/" + symbol + "/pe-ratio"
    data = requests.request("GET", url)
    # Keep the final URL after any redirects.
    url_completo = data.url
    print(url_completo)
    df_pe = scrape_table(url_completo)
    print(df_pe)
Outputs:
Amazon PE Ratio Historical Data
Date Stock Price TTM Net EPS PE Ratio
0 2019-04-12 1843.06 NaN 91.56
1 2018-12-31 1501.97 $20.13 74.61
2 2018-09-30 2003.00 $17.84 112.28
...
Thermo Fisher Scientific PE Ratio Historical Data
Date Stock Price TTM Net EPS PE Ratio
0 2019-04-12 280.65 NaN 38.71
1 2018-12-31 223.79 $7.25 30.87
2 2018-09-30 243.90 $6.33 38.53
...

Related

Getting data out of a <TD> Element using Python

I am writing an agent for plex and I am scraping the following html table
I am rather new to python and web scraping in general
I am trying to get to the data XXXXXXXXXX
THE DATA
<table class="d">
<tbody>
<tr>
<th class="ch">title</th>
<th class="ch">released</th>
<th class="ch">company</th>
<th class="ch">type</th>
<th class="ch">rating</th>
<th class="ch">category</th>
</tr>
<tr>
<td class="cd" valign="top">
XXXXXXXXXX
</td>
<td class="cd">2015</td>
<td class="cd">My Films</td>
<td class="cd"> </td>
<td class="cd"> </td>
<td class="cd">General Hardcore</td>
</tr>
</tbody>
</table>
THE CODE
This is a segment of the code I am using :
# NOTE(review): XPath attribute tests use '@', e.g. contains(@class, "d") —
# the '#' below looks like a markup/paste artifact from the Q&A site.
myTable = HTML.ElementFromURL(searchQuery, sleep=REQUEST_DELAY).xpath('//table[contains(#class,"d")]/tr')
self.log('SEARCH:: My Table: %s', myTable)
# This logs the following
#2019-12-26 00:26:49,329 (17a4) : INFO (logkit:16) - GEVI - SEARCH:: My Table: [<Element tr at 0x5225c30>, <Element tr at 0x5225c00>]
for myRow in myTable:
# NOTE(review): the loop variable is myRow, but the next lines index a name
# `title` that is not defined in this snippet — presumably these should read
# from myRow (e.g. myRow[0]); confirm against the full agent source.
siteTitle = title[0]
self.log('SEARCH:: Site Title: %s', siteTitle)
# text_content() flattens all descendant text of the cell, then strip() trims it.
siteTitle = title[0].text_content().strip()
self.log('SEARCH:: Site Title: %s', siteTitle)
# This logs the following for <tr>/<th> - ROW 1
# 2019-12-26 00:26:49,335 (17a4) : INFO (logkit:16) - GEVI - SEARCH:: Site Title: <Element th at 0x5225180>
# 2019-12-26 00:26:49,342 (17a4) : INFO (logkit:16) - GEVI - SEARCH:: Site Title: title
# This logs the following for <tr>/<th> - ROW 2
# 2019-12-26 00:26:49,362 (17a4) : INFO (logkit:16) - GEVI - SEARCH:: Site Title: <Element td at 0x52256f0>
# 2019-12-26 00:26:49,369 (17a4) : INFO (logkit:16) - GEVI - SEARCH:: Site Title: #### this is my issue... should be XXXXXXXXXX
# I can get the href using the following code
siteURL = myRow.xpath('.//td/a')[0].get('href')
THE QUESTIONS
A. How do I get the value 'XXXXXXXXXX', I tried using xPath but it picked up data from another table on the same page
B. Is There a better way of getting the href attribute?
OTHER
The python libraries I am using are
import datetime, linecache, platform, os, re, string, sys, urllib
I cannot use BeautifulSoup, as this is an agent for Plex, and therefore I am assuming that whoever wanted to use this agent would have to install BeautifulSoup;
so that is a no-go.
How's this?
from simplified_scrapy.simplified_doc import SimplifiedDoc
html = '''<table class="d">
<tbody>
<tr>
<th class="ch">title</th>
<th class="ch">released</th>
<th class="ch">company</th>
<th class="ch">type</th>
<th class="ch">rating</th>
<th class="ch">category</th>
</tr>
<tr>
<td class="cd" valign="top">
XXXXXXXXXX
</td>
<td class="cd">2015</td>
<td class="cd">My Films</td>
<td class="cd"> </td>
<td class="cd"> </td>
<td class="cd">General Hardcore</td>
</tr>
</tbody>
</table>'''
doc = SimplifiedDoc(html)
# Grab <table class="d">, then keep only the rows that contain a link.
table = doc.getElement('table', 'd')  # doc.getElement(tag='table', attr='class', value='d')
trs = table.trs.contains('<a ')  # table.getElementsByTag('tr').contains('<a ')
for row in trs:
    anchor = row.a
    print(anchor)
    print(anchor.text)  # prints XXXXXXXXXX

Scraping with requests and BS4

I'd like to get the content in the table to then put in a pandas dataframe in the following website: https://projects.fivethirtyeight.com/soccer-predictions/premier-league/
I'm quite new to BS, but I believe that what I want would be something like:
import requests
from bs4 import BeautifulSoup

# Download the predictions page and parse it.
r = requests.get(url="https://projects.fivethirtyeight.com/soccer-predictions/ligue-1/")
soup = BeautifulSoup(r.text, "html.parser")
#print(soup.prettify())
# NOTE: this search targets a <div>; the prose below reports it returns None.
print(soup.find("div", {"class": "forecast-table"}))
But of course, unfortunately this is returning "None". Any help and guidance would be amazing!
I believe that the bit I need to get is somewhere in here (not really sure though):
<div id="forecast-table-wrapper">
<table class="forecast-table" id="forecast-table">
<thead>
<tr class="desktop">
<th class="top nosort">
</th>
<th class="top bordered-right rating nosort drop-6" colspan="3">
Team rating
</th>
<th class="top nosort rating2" colspan="1">
</th>
<th class="top bordered-right nosort drop-1" colspan="5">
avg. simulated season
</th>
<th class="top bordered-right nosort show-1 drop-3" colspan="2">
avg. simulated season
</th>
<th class="top bordered nosort" colspan="4">
end-of-season probabilities
</th>
</tr>
<tr class="sep">
<th colspan="11">
</th>
</tr>
Since you're using pandas anyway, you can use the built-in table processing, like this:
# Let pandas locate and parse the table by its class attribute directly;
# header=1 uses the second header row as the column names.
pandas.read_html(
    'https://projects.fivethirtyeight.com/soccer-predictions/premier-league/',
    attrs={'class': 'forecast-table'},
    header=1,
)
That's because you are searching for a div, but it's a table, so it should be:
print(soup.find("table", {"class":"forecast-table"}))
import requests
from bs4 import BeautifulSoup

r = requests.get('https://projects.fivethirtyeight.com/soccer-predictions/ligue-1/')
soup = BeautifulSoup(r.content, 'html.parser')
# Find every forecast table, then dump the text of each of its rows.
for forecast in soup.find_all('table', attrs={'class': 'forecast-table'}):
    for row in forecast.find_all('tr'):
        print(row.text)
Output:
Team ratingavg. simulated seasonavg. simulated seasonend-of-season probabilities
teamspioff.def.WDLgoal diff.proj. pts.pts.relegatedrel.qualify for UCLmake UCLwin Ligue 1win league
PSG24 pts90.03.00.530.74.52.9+7897<1%>99%97%
Lyon14 pts76.32.10.719.69.19.3+2768<1%60%2%
Marseille13 pts71.12.00.918.38.311.4+1663<1%40%<1%
Lille19 pts63.71.70.916.78.612.6+9591%24%<1%
St Étienne15 pts62.71.60.914.710.912.4-1553%14%<1%
Montpellier16 pts64.01.50.713.912.411.7+2543%12%<1%
Nice11 pts62.01.60.913.510.014.5-7507%7%<1%
Monaco6 pts65.91.80.913.010.714.2+0508%7%<1%
Rennes8 pts63.41.60.813.010.514.5-3499%6%<1%
Bordeaux14 pts59.21.50.913.09.915.0-6498%5%<1%
Strasbourg12 pts59.21.51.012.610.814.6-2499%5%<1%
Angers11 pts60.41.50.912.610.215.2-54810%4%<1%
Toulouse13 pts58.21.50.911.912.014.1-104811%4%<1%
Dijon FCO10 pts57.71.61.112.28.517.3-124517%2%<1%
Caen10 pts55.61.41.010.812.414.8-104518%3%<1%
Nîmes10 pts54.91.51.110.711.615.6-134420%2%<1%
Reims10 pts55.31.30.910.312.315.4-144321%2%<1%
Nantes6 pts59.01.50.910.410.916.7-144225%1%<1%
Guingamp5 pts57.31.51.010.39.817.9-194130%<1%<1%
Amiens10 pts53.01.31.010.49.018.6-164031%<1%<1%

Beautiful Soup scrape for "Worldwide"

I'm trying to scrape some Box Office Mojo pages for worldwide box-office gross figures using Beautiful Soup. My code below will grab the Domestic figures just fine, but won't work when I sub in "Worldwide" for "Domestic Total Gross." Maybe because "Worldwide" shows up on the page more than once or something.
Any help on fixing it? I'll past the source code for the two portions as well. Thanks!
Source code below
<center><table border="0" border="0" cellspacing="1" cellpadding="4" bgcolor="#dcdcdc" width="95%"><tr bgcolor="#ffffff"><td align="center" colspan="2"><font size="4">Domestic Total Gross: <b>$172,825,435</b></font></td></tr><tr bgcolor="#ffffff"><td valign="top">Distributor: <b>MGM</b></td><td valign="top">Release Date: <b><nobr>December 16, 1988</nobr></b></td></tr><tr bgcolor="#ffffff"><td valign="top">Genre: <b>Drama</b></td><td valign="top">Runtime: <b>2 hrs. 13 min.</b></td></tr><tr bgcolor="#ffffff"><td valign="top">MPAA Rating: <b>R</b></td><td valign="top">Production Budget: <b>$25 million</b></td></tr></table> </td>
...skip...
<tr>
<td width="40%">= <b>Worldwide:</b></td>
<td width="35%" align="right"> <b>$354,825,435</b></td>
<td width="25%"> </td>
</tr>
Python code below
BOG_titles = ['=RainMan.htm']
def get_movie_value(soup, field_name):
    """Return the text following the first node whose text matches *field_name*.

    Returns "Nothing" when no node matches, and "Still Nothing" when the
    matching node has no following sibling to read a value from.
    """
    node = soup.find(text=re.compile(field_name))
    if not node:
        return "Nothing"
    sibling = node.findNextSibling()
    return sibling.text if sibling else "Still Nothing"
BOG_data = []
for movie_id in BOG_titles:
    page_url = 'http://www.boxofficemojo.com/movies/?id' + movie_id
    page = urllib2.urlopen(page_url)
    soup = BeautifulSoup(page)
    worldwide = get_movie_value(soup, "Worldwide")
    # The page <title> looks like "Name (Year) - ..."; keep the part before '('.
    title_string = soup.find('title').text
    title = title_string.split('(')[0].strip()
    BOG_data.append([title, worldwide])
Use the table inside the div.mp_box structure to get what you want:
In [1]: from bs4 import BeautifulSoup
In [2]: import requests
In [3]: r = requests.get("http://www.boxofficemojo.com/movies/?id=rainman.htm").content
In [4]: soup = BeautifulSoup(r,"lxml")
In [5]: table = soup.select_one("div.mp_box table")
In [6]: print(table)
<table border="0" cellpadding="0" cellspacing="0">
<tr>
<td width="40%"><b>Domestic:</b></td>
<td align="right" width="35%"> <b>$172,825,435</b></td>
<td align="right" width="25%">   <b>48.7%</b></td>
</tr>
<tr>
<td width="40%">+ Foreign:</td>
<td align="right" width="35%"> $182,000,000</td>
<td align="right" width="25%">   51.3%</td>
</tr>
<tr>
<td colspan="3" width="100%"><hr/></td>
</tr>
<tr>
<td width="40%">= <b>Worldwide:</b></td>
<td align="right" width="35%"> <b>$354,825,435</b></td>
<td width="25%"> </td>
</tr>
</table>
In [7]: rows = table.select("tr")
In [8]: rows[0].select_one("td + td").text
Out[8]: u'\xa0$172,825,435'
In [9]: rows[1].select_one("td + td").text
Out[9]: u'\xa0$182,000,000'
In [10]: rows[-1].select_one("td + td").text
Out[10]: u'\xa0$354,825,435'
To use the text without specifying the row:
In [27]: soup = BeautifulSoup(r,"lxml")
In [28]: table = soup.select_one("div.mp_box table")
In [29]: print(table.find("b", text="Domestic:").find_next("td").text)
 $172,825,435
In [30]: print(table.find("b", text="Worldwide:").find_next("td").text)
 $354,825,435
In [31]: print(table.find("a", text="Foreign:").find_next("td").text)
 $182,000,000

Scrape specific NHL score with Python Beautifulsoup

I am trying to scrape only the total score for a specified team. I have written the following:
import urllib.request
import re
from bs4 import BeautifulSoup

#url1 = "http://scores.nbcsports.com/nhl/scoreboard.asp"
## This works, however is using a set day for testing, will need url changed to url1 for current day scoreboard
url = "http://scores.nbcsports.com/nhl/scoreboard.asp?day=20141202"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page)
# Keep only the <td> cells whose text mentions Vancouver anywhere inside them.
allrows = soup.findAll('td')
userows = [cell for cell in allrows if cell.findAll(text=re.compile('Vancouver'))]
print(userows)
This returns:
[<td><table cellspacing="0"><tr class="shsTableTtlRow"><td class="shsNamD" colspan="1">Final</td>
<td class="shsTotD">1</td>
<td class="shsTotD">2</td>
<td class="shsTotD">3</td>
<td class="shsTotD">Tot</td>
</tr>
<tr>
<td class="shsNamD" nowrap=""><span class="shsLogo"><span class="shsNHLteam22sm_trans"></span></span>Vancouver</td>
<td class="shsTotD">1</td>
<td class="shsTotD">2</td>
<td class="shsTotD">1</td>
<td class="shsTotD">4</td>
</tr>
<tr>
<td class="shsNamD" nowrap=""><span class="shsLogo"><span class="shsNHLteam23sm_trans"></span></span>Washington</td>
<td class="shsTotD">0</td>
<td class="shsTotD">2</td>
<td class="shsTotD">1</td>
<td class="shsTotD">3</td>
</tr>
</table>
</td>, <td class="shsNamD" nowrap=""><span class="shsLogo"><span class="shsNHLteam22sm_trans"></span></span>Vancouver</td>]
What I can't seem to get to is the 4 in <td class="shsTotD">4</td> from the middle block. If it is only possible to get the 1 2 1 4 I could compare the values and always pick the largest, but I can't even seem to get that far. Thanks in advance.
Find the tag containing Vancouver and get the next td tags by using find_next_siblings():
# Locate the element labelled "Vancouver", then walk the following <td>
# siblings of its parent to print the per-period and total scores.
vancouver = soup.find('a', text='Vancouver')
score_cells = vancouver.parent.find_next_siblings('td', class_='shsTotD')
for cell in score_cells:
    print(cell.text)
Prints:
1
2
1
4

python beautiful soup extract data

I am parsing a html document using a Beautiful Soup 4.0.
Here is an example of table in document
<tr>
<td class="nob"></td>
<td class="">Time of price</td>
<td class=" pullElement pullData-DE000BWB14W0.teFull">08/06/2012</td>
<td class=" pullElement pullData-DE000BWB14W0.PriceTimeFull">11:43:08 </td>
<td class="nob"></td>
</tr>
<tr>
<td class="nob"></td>
<td class="">Daily volume (units)</td>
<td colspan="2" class=" pullElement pullData-DE000BWB14W0.EWXlume">0</td>
<td class="nob"></td>
<t/r>
I would like to extract 08/06/2012, 11:43:08, Daily volume, 0, etc.
This is my code to find specific table and all data of it
# Python 2: open the saved page and parse it with BeautifulSoup.
html = file("some_file.html")
soup = BeautifulSoup(html)
# Narrow to the one table we care about by its id.
t = soup.find(id="ctnt-2308")
# One list of stringified <td> cells per <tr> row of that table.
dat = [map(str, row.findAll("td")) for row in t.findAll("tr")]
I get a list of data that needs to be organized
Any suggestions to do it in a simple way??
Thank you
list(soup.stripped_strings)
will give you all the string in that soup (removing all trailing spaces)

Categories