Wikipedia table scraping using python - python

I am trying to scrape tables from wikipedia. I wrote a table scraper that downloads a table and saves it as a pandas data frame.
This is the code
from bs4 import BeautifulSoup
import pandas as pd
import urllib2

# Wikipedia rejects the default urllib User-Agent, so send a browser-like one.
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population', None, headers)
html = urllib2.urlopen(req).read()
soup = BeautifulSoup(html, 'lxml')  # Parse the HTML as a string

# BUG FIX: the browser shows class "wikitable sortable jquery-tablesorter",
# but "jquery-tablesorter" is added by JavaScript after the page loads; the
# raw HTML we download only carries "wikitable sortable", so the original
# selector matched nothing and find() returned None.
table = soup.find("table", {"class": "wikitable sortable"})

rank = []
country = []
pop = []
date = []
per = []
source = []

# Skip the header row, then read the six data cells of every remaining row.
# get_text() is used instead of .string because .string is None whenever a
# cell contains nested tags (links, footnote markers), which would crash
# .strip().
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    rank.append(col[0].get_text(strip=True))
    country.append(col[1].get_text(strip=True))
    # BUG FIX: the original did pop.append(col2), filling the Population
    # column with a second copy of the country name.
    pop.append(col[2].get_text(strip=True))
    date.append(col[3].get_text(strip=True))
    per.append(col[4].get_text(strip=True))
    source.append(col[5].get_text(strip=True))

columns = {'Rank': rank, 'Country': country, 'Population': pop,
           'Date': date, 'Percentage': per, 'Source': source}
# Create a dataframe from the columns variable
df = pd.DataFrame(columns)
df
But it is not downloading the table. The problem is in this section
table = soup.find("table", {"class":"wikitable sortable jquery-tablesorter"})
print table
where output is None

As far as I can see, there is no such element on that page. The main table has "class":"wikitable sortable" but not the jquery-tablesorter.
Make sure you know what element you are trying to select and check if your program sees the same elements you see, then make your selector.

The docs say you need to specify multiple classes like so:
soup.find("table", class_="wikitable sortable jquery-tablesorter")
Also, consider using requests instead of urllib2.

Related

How to scrape table in specific subsection of a page?

I'm trying to scrape a specific table from a page containing multiple tables. The url I'm using includes the subsection where the table is located.
So far I tried scraping all tables and select the one I need manually
# Target page; the fragment after '#' names the subsection whose table we want.
wikiurl = 'https://en.wikipedia.org/wiki/2011_in_Strikeforce#Strikeforce_Challengers:_Britt_vs._Sayers'
response=requests.get(wikiurl)
soup = BeautifulSoup(response.text, 'html.parser')
# Every fight-card table on this page uses the "toccolours" class.
table_class = "toccolours"
table = soup.find_all('table', table_class) # find all tables
# and pick right one
# NOTE(review): index 15 is hand-picked and will silently break if tables are
# added or removed upstream.
df=pd.read_html(str(table[15]))
Is it possible to use the information in the url #Strikeforce_Challengers:_Britt_vs._Sayers to only scrape the table in this section?
You are on the way - Simply split() url once by #, last element from result by _ and join() the elements to use them in the css selector with :-soup-contains():
table = soup.select_one(f'h2:-soup-contains("{" ".join(url.split("#")[-1].split("_"))}") ~ .toccolours')
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/wiki/2011_in_Strikeforce#Strikeforce_Challengers:_Britt_vs._Sayers'
response = requests.get(url)
soup = BeautifulSoup(response.content)
# Rebuild the section heading text ("Strikeforce Challengers: Britt vs.
# Sayers") from the URL fragment, select the <h2> containing it with
# :-soup-contains(), then take the following sibling element with class
# "toccolours" (the section's table).
table = soup.select_one(f'h2:-soup-contains("{" ".join(url.split("#")[-1].split("_"))}") ~ .toccolours')
# read_html returns a list of DataFrames; [0] is the only table we passed in.
pd.read_html(str(table))[0]

Parsing data from Wikipedia table

I want to parse data from wikipedia table, and turn in into a pandas dataframe.
https://en.wikipedia.org/wiki/MIUI
there is a table called 'version history'
so far I have written following code, but still can't get the data
wiki='https://en.wikipedia.org/wiki/MIUI'
# BUG(review): this string literal is broken across two physical lines, which
# is a SyntaxError in Python — it must be a single line or use implicit
# concatenation inside parentheses. Also, "jquery-tablesorter" and
# "mw-made-collapsible" are added by JavaScript in the browser and never
# appear in the HTML that requests downloads, so this exact class string
# cannot match and find() returns None.
table_class='wikitable sortable mw-collapsible mw-no-collapsible jquery-tablesorter
mw-made-collapsible'
response = requests.get(wiki)
soup = BeautifulSoup(response.text,'html.parser')
# Exact-match the table's class attribute string (see note above).
miui_v = soup.find('table', attrs={'class': table_class})
In html I downloaded table you are searching for has different class:
class="wikitable mw-collapsible mw-made-collapsible"
I guess it can change depending on the browser and its extensions. I recommend starting from an element that has an id, to guarantee a match. In your case you can do:
# Anchor on the stable id "mw-content-text" instead of volatile CSS classes.
miui_v = soup.find("div", {"id": "mw-content-text"})
# NOTE(review): positional indexing ([0], [1]) is brittle if the page layout
# changes — confirm the target table keeps this position.
my_table = miui_v.findChildren("div")[0].findChildren("table")[1]

Scraping table returns only “table” and not the contents of the table

Image description is here:
Scraping table returns only “table” and not the contents of the table.
Here is my code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "http://data.eastmoney.com/gdhs/detail/600798.html"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
# find_all returns a ResultSet (a list of <table> Tag objects), so printing
# it shows the raw table markup rather than the cell text.
table = soup.find_all('table')
print(table)
You found the table just fine with the code. Because the table is composed of multiple elements (tr/td), you have to loop through those to get the inner text of the table cells.
# This grabs the first occurrence of a table on the web page. If you want the second occurrence of a table on the web page, use soup.find_all('table')[1], etc.
# This grabs the first occurrence of a table on the web page. If you want the second occurrence of a table on the web page, use soup.find_all('table')[1], etc.
table = soup.find_all('table')[0]
# Use a splice if there are table headers. If you want to include the table headers, use table('tr')[0:]
# Calling a Tag — table('tr') — is shorthand for table.find_all('tr').
# NOTE(review): the loop body lost its indentation in this paste; the print
# line belongs inside the for loop.
for row in table('tr')[1:]:
print(row('td')[0].getText().strip())

Unable to get all the names from a table

I've created a script in python to get all the names out of a table from a webpage. The names within that table are available in the page source, so they are static content. However, when I try with my following script, I only get a few of them (up to 2012 Topps Heritage Run), whereas the list contains many more.
Site address
How can I get all the names from the table under Company Sets header using requests?
I've tried with so far:
import requests
from bs4 import BeautifulSoup
url = "https://www.psacard.com/psasetregistry/baseball/company-sets/16"
res = requests.get(url)
soup = BeautifulSoup(res.text,"lxml")
# CSS selector: anchors inside cells of any element with class "dataTable"
# whose href contains "/baseball/company-sets/".
# NOTE(review): the loop body lost its indentation in this paste; the print
# line belongs inside the for loop.
for item in soup.select(".dataTable tr td a[href*='/baseball/company-sets/']"):
print(item.text)
Can you try the following:
print([inner_tag.find('a').text for inner_tag in soup.findAll('table')[0].findAll('td') if inner_tag.find('a')])
Explanation:
Actually there are two tables in the page, and your code was extracting values from both the tables. That's the reason why you were getting the last value 2012.
The above code extracts the text only from the first table named Company Sets
You could combine requests with pandas read_html
import pandas as pd
import requests
url = 'https://www.psacard.com/psasetregistry/baseball/company-sets/16'
# Some sites reject the default requests User-Agent; send a browser-like one.
headers = {'User-Agent' : 'Mozilla/5.0'}
r= requests.get(url, headers= headers)
# read_html parses every <table> in the response into a list of DataFrames.
tables = pd.read_html(r.content)
df = tables[0]
# Drop the first row — presumably a header/filler row; TODO confirm against
# the live page.
df.drop(df.index[[0]], inplace = True)
print(df)

Using BeautifulSoup to find a attribute called data-stats

I'm currently working on a web scraper that will allow me to pull stats from a football player. Usually this would be an easy task if I could just grab the divs however, this website uses a attribute called data-stats and uses it like a class. This is an example of that.
<th scope="row" class="left " data-stat="year_id">2000</th>
If you would like to check the site for yourself here is the link.
https://www.pro-football-reference.com/players/B/BradTo00.htm
I've tried a few different methods. Either it won't work at all, or I am able to start a for loop and begin putting things into arrays; however, you will notice that not everything in the table is the same variable type.
Sorry for the formatting and the grammar.
Here is what I have so far, I'm sure its not the best looking code, it's mainly just code I've tried on my own and a few things mixed in from searching on Google. Ignore the random imports I was trying different things
# import libraries
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
# specify url
url = 'https://www.pro-football-reference.com/players/B/BradTo00.htm'
# request html
page = requests.get(url)
# Parse html using BeautifulSoup, you can use a different parser like lxml if present
soup = BeautifulSoup(page.content, 'lxml')
# find searches the given tag (div) with given class attribute and returns the first match it finds
# NOTE(review): despite the variable name, this collects <td> cell texts, not
# the table's header cells.
headers = [c.get_text() for c in soup.find(class_ = 'table_container').find_all('td')[0:31]]
# Nested list comprehension: one inner list of cell texts per <tr> that
# carries any class attribute.
data = [[cell.get_text(strip=True) for cell in row.find_all('td')[0:32]]
for row in soup.find_all("tr", class_=True)]
# NOTE(review): a keyword filter matches an attribute literally named "data",
# not "data-stat" (hyphenated names can't be keywords) — use
# soup.find(attrs={'data-stat': 'pos'}) instead; as written this returns None.
tags = soup.find(data ='pos')
#stats = tags.find_all('td')
print(tags)
You need to use the get method from BeautifulSoup to get the attributes by name
See: BeautifulSoup Get Attribute
Here is a snippet to get all the data you want from the table:
# Walk the stats table's <thead>, <tbody>, and <tfoot>, printing each cell's
# visible text alongside its data-stat attribute value.
# NOTE(review): the loop bodies lost their indentation in this paste; each
# commented group below belongs inside the for loop that precedes it.
from bs4 import BeautifulSoup
import requests
url = "https://www.pro-football-reference.com/players/B/BradTo00.htm"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Get table
table = soup.find(class_="table_outer_container")
# Get head
thead = table.find('thead')
th_head = thead.find_all('th')
for thh in th_head:
# Get case value
print(thh.get_text())
# Get data-stat value
print(thh.get('data-stat'))
# Get body
tbody = table.find('tbody')
tr_body = tbody.find_all('tr')
for trb in tr_body:
# Get id
print(trb.get('id'))
# Get th data
th = trb.find('th')
print(th.get_text())
print(th.get('data-stat'))
for td in trb.find_all('td'):
# Get case value
print(td.get_text())
# Get data-stat value
print(td.get('data-stat'))
# Get footer
tfoot = table.find('tfoot')
thf = tfoot.find('th')
# Get case value
print(thf.get_text())
# Get data-stat value
print(thf.get('data-stat'))
for tdf in tfoot.find_all('td'):
# Get case value
print(tdf.get_text())
# Get data-stat value
print(tdf.get('data-stat'))
You can of course save the data in a csv or even a json instead of printing it
It's not very clear what exactly you're trying to extract, but this might help you a little bit:
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.pro-football-reference.com/players/B/BradTo00.htm'
page = requests.get(url)
soup = bs(page.text, "html.parser")
# Extract table
# NOTE(review): find_all returns a list of every <table> Tag, so each "row"
# in the loop below is actually an entire table, not a table row.
table = soup.find_all('table')
# Let's extract data from each row in table
# NOTE(review): the loop bodies lost their indentation in this paste; the
# inner statements belong inside their respective for loops.
for row in table:
col = row.find_all('td')
for c in col:
print(c.text)
Hope this helps!

Categories