Scraping multiple pages in one Beautiful Soup script -- getting same result - python

I'm trying to loop through a script that parses tables with Beautiful Soup in Python 2.7.
The first pass of the loop parses the table and produces the expected results, but the second pass produces exactly the same results as the first.
Additional details:
If I manually open the URL that the second loop uses, I get the page I intended to scrape (there is a slight delay while it refreshes).
I use this on other websites and the loop works as intended.
Here is the script:
import urllib2
import csv
from bs4 import BeautifulSoup  # latest version bs4

week = raw_input("Which week?")
week = str(week)
data = []

first = "http://fantasy.nfl.com/research/projections#researchProjections=researchProjections%2C%2Fresearch%2Fprojections%253Foffset%253D"
middle = "%2526position%253DO%2526sort%253DprojectedPts%2526statCategory%253DprojectedStats%2526statSeason%253D2015%2526statType%253DweekProjectedStats%2526statWeek%253D"
last = "%2Creplace"

page_num = 1
for page_num in range(1, 3):
    page_mult = (page_num - 1) * 25 + 1
    next = str(page_mult)
    url = first + next + middle + week + last
    print url  # I added this in order to check my output

    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('table', attrs={'class': 'tableType-player hasGroups'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

    b = open('NFLtable.csv', 'w')
    a = csv.writer(b)
    a.writerows(data)
    b.close()

    page_num = page_num + 1

print data

On the actual page they are using AJAX to request additional results, with a JSON response with some HTML as one of the values.
I modified your code a bit, give it a try:
import urllib2
import urllib
import csv
from bs4 import BeautifulSoup  # latest version bs4
import json

week = raw_input("Which week?")
week = str(week)
data = []

url_format = "http://fantasy.nfl.com/research/projections?offset={offset}&position=O&sort=projectedPts&statCategory=projectedStats&statSeason=2015&statType=weekProjectedStats&statWeek={week}"

for page_num in range(1, 3):
    page_mult = (page_num - 1) * 25 + 1
    next = str(page_mult)
    url = url_format.format(week=week, offset=page_mult)
    print url  # I added this in order to check my output

    request = urllib2.Request(url, headers={'Ajax-Request': 'researchProjections'})
    raw_json = urllib2.urlopen(request).read()
    parsed_json = json.loads(raw_json)
    html = parsed_json['content']

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'class': 'tableType-player hasGroups'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

print data
I tested with week=4.
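If you still want the CSV output from your original script, one option (a minimal sketch, reusing your NFLtable.csv filename) is to write the collected data once after the loop instead of reopening the file on every pass:
import csv

# Write the combined rows once, after both pages have been scraped.
# 'wb' avoids blank lines with the csv module under Python 2.
with open('NFLtable.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(data)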

Related

I have tried to scrape a table using BeautifulSoup and only one line of the table appears as output

I have tried to scrape the table at http://www.geonames.org/search.html?q=kwadukuza&country=ZA, but only the last line of the table appears in the output:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
requests.get(url)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns = headers)

for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data]
You can use a separate list to append the row data to, building a list of lists, and then use it as the rows for your df:
all_rows = []
for j in table_data.find_all('tr', class_="odd"):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]
    all_rows.append(row)
For DataFrame:
df = pd.DataFrame(columns = headers,data=all_rows)
Output:
df.shape
(25,6)
As the comment already says, you need to put the row = [tr.text.strip() for tr in row_data] in the for loop. Otherwise you would just get the last entry.
In order to add the rows to the DataFrame, you need to make a list of all rows and combine it with the headers into a DataFrame. You could also append the rows to the DataFrame one at a time, but that is less efficient.
Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
requests.get(url)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

data = []
for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]  # Put into the for loop
    data.append(row)

# DataFrame
df = pd.DataFrame(columns=headers, data=data)
print(df)

scraping select all checkbox using python

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table = soup.find('table', {'class': 'table'})
rows = table.find_all('th')

headers = []
for i in table.find_all('th'):
    title = i.text
    headers.append(title)

df = pd.DataFrame(columns=headers)

for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data

print(df)
I need to scrape a table from a website, but it has a select-all checkbox for each row. What should I do?
Any help will be appreciated, thank you.
(If I understand your question correctly: you want to remove the checkboxes from the output of the table).
Since the checkboxes are in the first column of the table, you can skip them with index slicing: [1:] means "from index 1 to the end", skipping the zero-based first entry.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = (
    "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
)
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

table = soup.find("table", {"class": "table"})

headers = []
for i in table.find_all("th"):
    title = i.text.strip()
    headers.append(title)

rows = []
for row in table.find_all("tr")[1:]:
    data = row.find_all("td")
    rows.append([td.text.strip() for td in data[1:]])  # skip the checkbox cell

df = pd.DataFrame(rows[:-1], columns=headers[1:])
print(df)
Output:
Scrip P.Close Open High Low LTP # REAL LTP(NOW) Result
0 BRITANNIA 3379.10 3385.00 3447.00 3385.00 3439.50 3439.50 0
1 EICHERMOT 2551.20 2565.00 2634.00 2565.00 2625.05 2625.05 0
You don't need to check those boxes in order to return all rows.
You can grab the table with pandas and drop the first column by name (if desired).
You can also do some tidying to match the web page.
import pandas as pd
df = pd.read_html('https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha')[0]
df.drop(columns={'Sr.No.'}, inplace=True)
df.iloc[-1, 0:4] = ''
df.fillna(0, inplace=True)
df

Scraping table (several pages) to Pandas Dataframe

I'm trying to transfer the data from a long table (24 pages) to a Pandas DataFrame, but I'm facing some issues with (I think) the for-loop code.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'

res = requests.get(base_url.format('1'))
soup = BeautifulSoup(res.text, 'lxml')
table = soup.select('table.table')[0]
columns = table.find('tr').find_all('th')
columns_names = [str(c.get_text()).strip() for c in columns]
table_rows = table.find_all('tr', class_='team')

l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)

df = pd.DataFrame(l, columns=columns_names)
The Dataframe comes out as a repetition of the first page only, rather than a copy of all the data in the table.
I agree with @mxbi.
Try it:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'

l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')

    table = soup.select('table.table')[0]
    columns = table.find('tr').find_all('th')
    columns_names = [str(c.get_text()).strip() for c in columns]
    table_rows = table.find_all('tr', class_='team')

    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)

df = pd.DataFrame(l, columns=columns_names)
requests is needed because the server wants a user-agent header, which pandas read_html doesn't let you set. As you still want to use pandas to build the DataFrame, you can gain some efficiency by using multiprocessing to handle the requests; within a user-defined function, extract the table of interest and pass it as a string to read_html. You will get a list of DataFrames which can be combined with pandas concat.
Note: this can't be run from within Jupyter, as multiprocessing will block.
import pandas as pd
from multiprocessing import Pool, cpu_count
import requests
from bs4 import BeautifulSoup as bs

def get_table(url: str) -> pd.DataFrame:
    soup = bs(requests.get(url).text, 'lxml')
    df = pd.read_html(str(soup.select_one('.table')))[0]
    df['page_num'] = url.split("=")[-1]
    return df

if __name__ == '__main__':
    urls = [f'https://scrapethissite.com/pages/forms/?page_num={i}' for i in range(1, 25)]
    with Pool(cpu_count() - 1) as p:
        results = p.map(get_table, urls)
    final = pd.concat(results)
    print(final)
    # final.to_csv('data.csv', index=False, encoding='utf-8-sig')

Extract both text and urls from table in page - beautiful soup

I am trying to extract both the text and the URLs in a table from a website, but I only seem to be able to get the text. I am guessing this has something to do with the text.strip in my code, but I am not sure how to clean up the HTML tags without removing the URL links in there. Here's what I've put together so far:
import requests
from bs4 import BeautifulSoup

start_number = 0
max_number = 5

urls = []
for number in range(start_number, max_number + start_number):
    url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=' + str(number) + '&lang=en'
    urls.append(url)

data = []
for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table")
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
Simply extract the href from the <a> element. For the purpose of the answer, I simplified the code not to worry about subsequent pages.
from collections import namedtuple

import requests
from bs4 import BeautifulSoup

url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=0&lang=en'

data = []
Record = namedtuple('Record', 'id company agency date pdf_link')

r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
rows = soup.select('table > tbody > tr')
for row in rows[1:]:  # omit header row
    cols = row.find_all('td')
    fields = [td.text.strip() for td in cols if td.text.strip()]
    if fields:  # if the row is not empty
        pdf_link = row.find('a')['href']
        record = Record(*fields, pdf_link)
        data.append(record)
>>> data[0].pdf_link
'images/notifikasi/619.%20Pengumuman%20Publik%20PT%20IGP.compressed.pdf'
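If you also want the other pages, one option (a minimal sketch, assuming every page uses the same table layout and the same five fields per row) is to wrap the same parsing in the limitstart loop from your original code:
# Loop over the same limitstart offsets as the original script
base_url = ('http://www.ispo-org.or.id/index.php?option=com_content&view=article'
            '&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart={}&lang=en')

data = []
for number in range(0, 5):
    r = requests.get(base_url.format(number))
    soup = BeautifulSoup(r.content, 'html.parser')
    for row in soup.select('table > tbody > tr')[1:]:  # omit header row
        fields = [td.text.strip() for td in row.find_all('td') if td.text.strip()]
        if fields:  # skip empty rows
            data.append(Record(*fields, row.find('a')['href']))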

BeautifulSoup html table scrape - will only return last row

I am attempting a simple scrape of an HTML table using BeautifulSoup with the following:
import urllib
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    page = urllib.request.urlopen(url)
    sdata = BeautifulSoup(page, 'html.parser')
    return sdata

url = 'http://www.satp.org/satporgtp/countries/pakistan/database/bombblast.htm'
soup = make_soup(url)

table = soup.findAll('table', attrs={'class': 'pagraph1'})
table = table[0]
trows = table.findAll('tr')

bbdata_ = []
bbdata = []

for trow in trows:
    bbdata_ = trow.findAll('td')
    bbdata = [ele.text.strip() for ele in bbdata_]

print(bbdata)
However, I can only extract the last row in the table, i.e.
['Total*', '369', '1032+']
All of the data is included in the trows, so I must be forming my loop incorrectly, but I am not sure how.
Your problem is here:
bbdata = [ele.text.strip() for ele in bbdata_]
You want to append to the list or extend it:
bbdata.append([ele.text.strip() for ele in bbdata_])
You are overwriting bbdata on each pass through the loop, which is why it ends up holding only the final row.
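Put together, a minimal corrected version of your loop (keeping your variable names) might look like this:
bbdata = []
for trow in trows:
    bbdata_ = trow.findAll('td')
    bbdata.append([ele.text.strip() for ele in bbdata_])  # one list per row

for row in bbdata:
    print(row)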
