Python 3: Read HTML Table With Pandas

Need some help here. I plan to extract all the statistical data from this site: https://lotostats.ro/toate-rezultatele-win-for-life-10-20
My issue is that I am not able to read the table; I can't even do it for the first page.
Can someone please help?
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc, 'html.parser')  # html_doc is never defined, so this line is commented out; soup is not used below

url = 'https://lotostats.ro/toate-rezultatele-win-for-life-10-20'
# Create a handle, page, to handle the contents of the website
page = requests.get(url)
# Store the contents of the website under doc
doc = lh.fromstring(page.content)
# Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')
# Create empty list
col = []
i = 0
# For each row, store the first element (header) and an empty list
for t in tr_elements[0]:
    i += 1
    name = t.text_content()
    print('%d:"%s"' % (i, name))
    col.append((name, []))
# Since our first row is the header, data is stored on the second row onwards
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]
    # If row is not of size 10, the //tr data is not from our table
    # if len(T) != 10:
    #     break
    # i is the index of our column
    i = 0
    # Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        # Check if row is empty
        if i > 0:
            # Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i += 1

Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)
df.head()
print(df)

The data is added dynamically with JavaScript, so it is not in the initial HTML. You can find the source, which returns JSON, in the browser's network tab:
import requests
r = requests.get('https://lotostats.ro/all-rez/win_for_life_10_20?draw=1&columns%5B0%5D%5Bdata%5D=0&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=true&columns%5B0%5D%5Borderable%5D=false&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B1%5D%5Bdata%5D=1&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=true&columns%5B1%5D%5Borderable%5D=false&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=false&start=0&length=20&search%5Bvalue%5D=&search%5Bregex%5D=false&_=1564996040879').json()
You can decode that URL and likely (worth investigating) remove the timestamp part at the end, or simply replace it with a random number:
import requests
r = requests.get('https://lotostats.ro/all-rez/win_for_life_10_20?draw=1&columns[0][data]=0&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=false&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=1&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=false&columns[1][search][value]=&columns[1][search][regex]=false&start=0&length=20&search[value]=&search[regex]=false&_=1').json()
To see the lottery lines:
print(r['data'])
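If you want the result as a DataFrame, a minimal sketch (assuming each entry in r['data'] is a list of column values, the usual DataTables row format):
import pandas as pd

# assumption: r['data'] is a list of rows, each row a list of column values
df = pd.DataFrame(r['data'])
print(df.head())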
The draw parameter seems to be related to the page of draws, e.g. the 2nd page:
https://lotostats.ro/all-rez/win_for_life_10_20?draw=2&columns[0][data]=0&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=false&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=1&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=false&columns[1][search][value]=&columns[1][search][regex]=false&start=20&length=20&search[value]=&search[regex]=false&_=1564996040880
You can alter the length parameter to retrieve more results. For example, you can deliberately oversize it to get all the results:
import requests
r = requests.get('https://lotostats.ro/all-rez/win_for_life_10_20?draw=1&columns[0][data]=0&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=false&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=1&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=false&columns[1][search][value]=&columns[1][search][regex]=false&start=0&length=100000&search[value]=&search[regex]=false&_=1').json()
print(len(r['data']))
Otherwise, you can set the length param to a fixed number, do an initial request, and calculate the number of pages from the total record count (r['recordsFiltered']) divided by the results per page:
import math
total_results = r['recordsFiltered']
results_per_page = 20
num_pages = math.ceil(total_results/results_per_page)
Then do a loop to get all results (remembering to alter the draw param and the start offset, as in the second-page URL above). Obviously, the fewer requests the better.
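A minimal sketch of that loop, assuming the endpoint and query parameters shown above (passed via requests' params argument for readability) and collecting every page's rows into one list:
import math
import requests

endpoint = 'https://lotostats.ro/all-rez/win_for_life_10_20'
results_per_page = 20

def make_params(draw, start, length):
    # base query parameters taken from the URLs above; only draw/start change per page
    return {
        'draw': draw,
        'columns[0][data]': 0, 'columns[0][name]': '', 'columns[0][searchable]': 'true',
        'columns[0][orderable]': 'false', 'columns[0][search][value]': '', 'columns[0][search][regex]': 'false',
        'columns[1][data]': 1, 'columns[1][name]': '', 'columns[1][searchable]': 'true',
        'columns[1][orderable]': 'false', 'columns[1][search][value]': '', 'columns[1][search][regex]': 'false',
        'start': start, 'length': length,
        'search[value]': '', 'search[regex]': 'false', '_': 1,
    }

with requests.Session() as s:
    first = s.get(endpoint, params=make_params(1, 0, results_per_page)).json()
    all_rows = list(first['data'])
    num_pages = math.ceil(first['recordsFiltered'] / results_per_page)
    for page in range(2, num_pages + 1):
        start = (page - 1) * results_per_page
        r = s.get(endpoint, params=make_params(page, start, results_per_page)).json()
        all_rows.extend(r['data'])

print(len(all_rows))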


How to convert wikipedia tables into pandas dataframes? [duplicate]

I want to apply some statistics to data tables obtained directly from specific internet pages.
This tutorial https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059 helped me create a data frame from a table at the webpage http://pokemondb.net/pokedex/all. However, I want to do the same for geographic data, such as the population and GDP of several countries.
I found some tables on Wikipedia, but it doesn't work quite well and I don't understand why. Here's my code, which follows the above-mentioned tutorial:
import requests
import lxml.html as lh
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_African_countries_by_population'
# Create a handle, page, to handle the contents of the website
page = requests.get(url)
# Store the contents of the website under doc
doc = lh.fromstring(page.content)
# Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')
# Check the length of the first 12 rows
print('Length of first 12 rows')
print([len(T) for T in tr_elements[:12]])
# Create empty list
col = []
i = 0
# For each row, store the first element (header) and an empty list
for t in tr_elements[0]:
    i += 1
    name = t.text_content()
    print('%d:"%s"' % (i, name))
    col.append((name, []))
# Since our first row is the header, data is stored on the second row onwards
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]
    # If row is not of size 10, the //tr data is not from our table
    if len(T) != 10:
        break
    # i is the index of our column
    i = 0
    # Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        # Check if row is empty
        if i > 0:
            # Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i += 1
print('Data gathering: done!')
print('Column length:')
print([len(C) for (title, C) in col])

Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)
print(df.head())
The output is the following:
Length of first 12 rows
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
1:"Ranks
"
2:"Countries(or dependent territory)
"
3:"Officialfigure(whereavailable)
"
4:"Date oflast figure
"
5:"Source
"
Data gathering: done!
Column length:
[0, 0, 0, 0, 0]
Empty DataFrame
Columns: [Ranks
, Countries(or dependent territory)
, Officialfigure(whereavailable)
, Date oflast figure
, Source
]
Index: []
The length of the columns shouldn't be zero. The format is not the same as in the tutorial. Any idea how to make it right? Or maybe another data source that doesn't return this strange output format?
The length of your rows, as you've shown by your print statement in line 16 (which corresponds to the first line of your output), is not 10. It is 5. And your code breaks out of the loop in the very first iteration, instead of populating your col.
Changing this statement:
if len(T)!=10:
break
to
if len(T)!=5:
break
should fix the problem.
Instead of using requests, use pandas to read the URL data:
df = pd.read_html(url)
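Note that pd.read_html returns a list of DataFrames, one per <table> element it finds, and it needs a parser library such as lxml or html5lib installed. A minimal sketch, assuming the population table is the first table on the page:
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_African_countries_by_population'
tables = pd.read_html(url)  # list of DataFrames, one per HTML table
df = tables[0]              # assumption: the population table is the first one
print(df.head())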
On line 52 you are trying to edit a tuple. This is not possible in Python.
To correct this, use a list instead.
Change line 25 to col.append([name, []])
In addition, the break exits the for loop, which is why there is no data inside the array.
When doing these sorts of things you also must look at the HTML. The table isn't formatted as nicely as one would hope. For example, it has a bunch of newlines and also includes the images of each country's flag, so the format is a little different every time.
It seems like you want an easy way to do this. I would look into BeautifulSoup4. I have added a way that I would do this with bs4. You'll have to do some editing to make it look better.
import requests
import bs4 as bs
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_African_countries_by_population'
column_names = []
data = []
# Create a handle, page, to handle the contents of the website
page = requests.get(url)
# Store the html in the soup object
soup = bs.BeautifulSoup(page.content, 'html.parser')
# Gets the table html
table = soup.find_all('table')[0]
# Gets the table header
thead = table.find_all('th')
# Puts the header into the column names list. We will use this for the dict keys later
for th in thead:
    column_names.append(th.get_text())
# Gets all the rows of the table
rows = table.find_all('tr')
# Skip the first row, as it is the header
for row in rows[1:]:
    # Creates a list with each index being a different entry in the row
    # (indices 1, 3, 5, ... are the <td> tags; the even indices are whitespace text nodes)
    values = [r for r in row]
    # Gets each value that we care about
    rank = values[1].get_text()
    country = values[3].get_text()
    pop = values[5].get_text()
    date = values[7].get_text()
    source = values[9].get_text()
    temp_list = [rank, country, pop, date, source]
    # Creates a dictionary with keys being the column names and the values being temp_list,
    # and appends it to the list data
    data.append(dict(zip(column_names, temp_list)))
print(column_names)
df = pd.DataFrame(data)

How can I webscrape a Wikipedia table with lists of data instead of rows?

I am trying to get data from the Localities table on the Wikipedia page https://en.wikipedia.org/wiki/Districts_of_Warsaw.
I would like to collect this data and put it into a dataframe with two columns ["Districts"] and ["Neighbourhoods"].
My code so far looks like this:
url = "https://en.wikipedia.org/wiki/Districts_of_Warsaw"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html")
table = soup.find_all('table')[2]
A=[]
B=[]
for row in table.findAll('tr'):
cells=row.findAll('td')
if len(cells)==2:
A.append(cells[0].find(text=True))
B.append(cells[1].find(text=True))
df=pd.DataFrame(A,columns=['Neighbourhood'])
df['District']=B
print(df)
This gives the following dataframe (shown as a screenshot in the original post).
Certainly, scraping the Neighbourhood column is not right since the neighbourhoods are contained in lists, but I don't know how it should be done, so I will be glad for any tips.
In addition, I would appreciate any hints as to why scraping gives me only 10 districts instead of 18.
Are you sure that you are scraping the right table? I understood that you need the second table, with 18 districts and their listed neighbourhoods.
Also, I'm not sure how you want the districts and neighbourhoods arranged in the DataFrame; I've set districts as columns and neighbourhoods as rows. You can change it as you want.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://en.wikipedia.org/wiki/Districts_of_Warsaw"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")
table = soup.find_all("table")[1]

def process_list(tr):
    result = []
    for td in tr.findAll("td"):
        result.append([x.string for x in td.findAll("li")])
    return result

districts = []
neighbourhoods = []
for row in table.findAll("tr"):
    if row.find("ul"):
        neighbourhoods.extend(process_list(row))
    else:
        districts.extend([x.string.strip() for x in row.findAll("th")])

# Check and arrange as you wish
for i in range(len(districts)):
    print(f'District {districts[i]} has neighbourhoods: {", ".join(neighbourhoods[i])}')

df = pd.DataFrame()
for i in range(len(districts)):
    df[districts[i]] = pd.Series(neighbourhoods[i])
Some tips:
Use element.string to get the text from an element.
Use string.strip() to remove any leading and trailing characters (whitespace is removed by default), i.e. to clean the text.
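A tiny illustration of both tips (the sample <td> markup here is made up for the example):
from bs4 import BeautifulSoup

td = BeautifulSoup("<td>  Mokotów\n</td>", "html.parser").td
print(repr(td.string))          # '  Mokotów\n'  -- .string returns the tag's text
print(repr(td.string.strip()))  # 'Mokotów'      -- .strip() removes the surrounding whitespace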
You can use the fact that odd rows are the districts and even rows are the neighbourhoods: walk the odd rows and use findNext to grab the neighbourhoods from the row below, whilst iterating the district columns within the odd rows:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from itertools import zip_longest

soup = bs(requests.get('https://en.wikipedia.org/wiki/Districts_of_Warsaw').content, 'lxml')
table = soup.select_one('h2:contains("Localities") ~ .wikitable')  # isolate table of interest
results = []

for row in table.select('tr')[0::2]:  # walk the odd rows
    for i in row.select('th'):  # walk the districts
        # zip the current district to the list of neighbourhoods in the row below;
        # fill with the district name to get lists of equal length
        r = list(zip_longest([i.text.strip()],
                             [i.text for i in row.findNext('tr').select('li')],
                             fillvalue=i.text.strip()))
        results.append(r)

results = [i for j in results for i in j]  # flatten list of lists
df = pd.DataFrame(results, columns=['District', 'Neighbourhood'])
print(df)

Why is my for loop overwriting instead of appending CSV?

I am trying to scrape the IB website. I have created the URLs to iterate over, and I am able to extract the required information, but it seems the dataframe keeps being overwritten instead of appended to.
import pandas as pd
from pandas import DataFrame as df
from bs4 import BeautifulSoup
import csv
import requests

base_url = "https://www.interactivebrokers.com/en/index.php?f=2222&exch=mexi&showcategories=STK&p=&cc=&limit=100"

n = 1
url_list = []
while n <= 2:
    url = (base_url + "&page=%d" % n)
    url_list.append(url)
    n = n + 1

def parse_websites(url_list):
    for url in url_list:
        html_string = requests.get(url)
        soup = BeautifulSoup(html_string.text, 'lxml')  # Parse the HTML as a string
        table = soup.find('div', {'class': 'table-responsive no-margin'})  # Grab the first table
        df = pd.DataFrame(columns=range(0, 4), index=[0])  # I know the size
        for row_marker, row in enumerate(table.find_all('tr')):
            column_marker = 0
            columns = row.find_all('td')
            try:
                df.loc[row_marker] = [column.get_text() for column in columns]
            except ValueError:
                # It's a safe way when [column.get_text() for column in columns] is an empty list.
                continue
        print(df)
        df.to_csv('path_to_file\\test1.csv')

parse_websites(url_list)
Can you please take a look at my code and advise what I am doing wrong?
One solution, if you want to append the data frames to the file, is to write in append mode:
df.to_csv('path_to_file\\test1.csv', mode='a', header=False)
Otherwise, you should create the data frame outside the loop, as mentioned in the comments.
If you define a data structure from within a loop, each iteration of the loop will redefine it, meaning that the earlier work is overwritten.
The dataframe should be defined outside of the loop if you do not want it to be overwritten.
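A minimal sketch of that rearrangement, collecting one frame per page and writing the CSV once at the end (parse_table is a hypothetical helper standing in for the row-extraction logic above):
import pandas as pd
import requests
from bs4 import BeautifulSoup

def parse_table(url):
    # Hypothetical helper: return one DataFrame for one page of results
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    table = soup.find('div', {'class': 'table-responsive no-margin'})
    rows = [[td.get_text() for td in tr.find_all('td')] for tr in table.find_all('tr')]
    return pd.DataFrame([r for r in rows if r])  # drop header rows, which have no <td> cells

frames = [parse_table(url) for url in url_list]  # url_list built as in the question
result = pd.concat(frames, ignore_index=True)    # one dataframe holding every page
result.to_csv('test1.csv', index=False)          # written once, so nothing is overwritten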

Extracting tables from web

I need to extract all the tables (only the second column) from this web page:
https://zh.wikipedia.org/wiki/上海证券交易所上市公司列表
Well, I don't need the last three tables...
However, my code only extracts the second column from the first table.
import pickle
import requests
import bs4 as bs

def save_china_tickers():
    resp = requests.get('https://zh.wikipedia.org/wiki/上海证券交易所上市公司列表')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[1].text
        tickers.append(ticker)
    with open('chinatickers.pickle', 'wb') as f:
        pickle.dump(tickers, f)
    return tickers

save_china_tickers()
I have an easy method:
Get the HTTP response
Find all tables using regex
Parse each HTML table into a list of lists
Iterate over each list in the list
Requirements:
dashtable
Code:
from urllib.request import urlopen
from dashtable import html2data  # to convert an html table to a list of lists
import re

url = "https://zh.wikipedia.org/wiki/%E4%B8%8A%E6%B5%B7%E8%AF%81%E5%88%B8%E4%BA%A4%E6%98%93%E6%89%80%E4%B8%8A%E5%B8%82%E5%85%AC%E5%8F%B8%E5%88%97%E8%A1%A8"

# Reading http content
data = urlopen(url).read().decode()

# Now fetching all tables with the help of regex
tables = ["<table>{}</table>".format(table) for table in re.findall(r"<table .*?>(.*?)</table>", data, re.M | re.S | re.I)]

# Parsing data
parsed_tables = [html2data(table)[0] for table in tables]  # html2data returns a tuple with 0th index as list of lists

# Let's take the first table, i.e. 600000-600099
parsed = parsed_tables[0]

# Column names of the first table
print(parsed[0])

# Rows of the first table, 2nd column
for index in range(1, len(parsed)):
    print(parsed[index][1])

"""
Output: All the rows of table 1, column 2, excluding the headers
"""

Trouble parsing HTML page with Python

I'm trying to get hold of the data under the columns with the code "SVENYXX", where "XX" are the numbers that follow (e.g. 01, 02, etc.), on the site http://www.federalreserve.gov/econresdata/researchdata/feds200628_1.html using Python. I am currently using the method prescribed by the site http://docs.python-guide.org/en/latest/scenarios/scrape/. However, I don't know how to determine the divs for this page, so I am unable to proceed, and I was hoping to get some help with this.
This is what I have so far:
from lxml import html
import requests
page = requests.get('http://www.federalreserve.gov/econresdata/researchdata/feds200628_1.html')
tree = html.fromstring(page.text)
Thank You
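One way to push the lxml approach further is to search the table headers directly with XPath rather than looking for divs; a minimal sketch, assuming the column headers are <th> elements whose text contains 'SVENY' (the answer below makes the same assumption):
from lxml import html
import requests

page = requests.get('http://www.federalreserve.gov/econresdata/researchdata/feds200628_1.html')
tree = html.fromstring(page.content)
# header cells whose text mentions SVENY (assumption: the column headers are <th> elements)
sveny_headers = tree.xpath('//table//th[contains(., "SVENY")]/text()')
print(sveny_headers)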
Have you tried using BeautifulSoup? I'm a pretty big fan. Using that you can easily iterate through all of the info you want, searching by tag.
Here's something I threw together that prints out the values in each column you are looking at. Not sure what you want to do with the data, but hopefully it helps.
from bs4 import BeautifulSoup
from urllib import request

page = request.urlopen('http://www.federalreserve.gov/econresdata/researchdata/feds200628_1.html').read()
soup = BeautifulSoup(page)
desired_table = soup.findAll('table')[2]

# Find the columns you want data from
headers = desired_table.findAll('th')
desired_columns = []
for th in headers:
    if 'SVENY' in th.string:
        desired_columns.append(headers.index(th))

# Iterate through each row grabbing the data from the desired columns
rows = desired_table.findAll('tr')
for row in rows[1:]:
    cells = row.findAll('td')
    for column in desired_columns:
        print(cells[column].text)
In response to your second request:
from bs4 import BeautifulSoup
from urllib import request

page = request.urlopen('http://www.federalreserve.gov/econresdata/researchdata/feds200628_1.html').read()
soup = BeautifulSoup(page)
desired_table = soup.findAll('table')[2]
data = {}

# Find the columns you want data from
headers = desired_table.findAll('th')
desired_columns = []
column_count = 0
for th in headers:
    if 'SVENY' in th.string:
        data[th.string] = {'column': headers.index(th), 'data': []}
        column_count += 1

# Iterate through each row grabbing the data from the desired columns
rows = desired_table.findAll('tr')
for row in rows[1:]:
    date = row.findAll('th')[0].text
    cells = row.findAll('td')
    for header, info in data.items():
        column_number = info['column']
        cell_data = [date, cells[column_number].text]
        info['data'].append(cell_data)
This returns a dictionary where each key is the header for a column, and each value is another dictionary that has 1) the column it's in on the site, and 2) the actual data you want, in a list of lists.
As an example:
for year_number in data['SVENY01']['data']:
print(year_number)
['2015-06-05', '0.3487']
['2015-06-04', '0.3124']
['2015-06-03', '0.3238']
['2015-06-02', '0.3040']
['2015-06-01', '0.3009']
['2015-05-29', '0.2957']
etc.
You can fiddle around with this to get the info how and where you want it, but hopefully this is helpful.
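If you would rather end up with a single DataFrame, a minimal sketch building a long-format frame from the data dictionary above (the column names here are just chosen for the example):
import pandas as pd

rows = []
for series_name, info in data.items():
    for date, value in info['data']:
        rows.append({'date': date, 'series': series_name, 'value': value})

df = pd.DataFrame(rows)
print(df.head())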
