Scraping results displayed across multiple pages from a `.aspx` site - python

I'm trying to scrape this site:
website address
If I manually search for "A", the results are spread across multiple pages, but when I try to fetch them using my script below, I get the results from the first page repeatedly.
This is what I've tried:
import requests
from bs4 import BeautifulSoup

url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['txtSearch'] = 'A'
    payload['__EVENTTARGET'] = 'gvResults'
    payload['__EVENTARGUMENT'] = f'Page${page}'
    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
How can I get the results from other pages as well?

Your problem happens at the line below:
payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
When you request the 2nd page, this payload still carries the extra btnSearch field, which causes the request to be treated as a new search operation instead of a next-page operation.
The fix is quite simple; below is the updated code:
import requests
from bs4 import BeautifulSoup

url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['txtSearch'] = 'A'
    payload['__EVENTTARGET'] = 'gvResults'
    payload['__EVENTARGUMENT'] = f'Page${page}'
    if page > 1:
        # drop btnSearch so the server treats this as a page change, not a new search
        payload.pop('btnSearch')
    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
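One note on the slice, in case you want more than a single result per page: soup.select("#gvResults tr")[1:2] keeps only the first data row. A minimal tweak, assuming you want every row after the header, is:
for items in soup.select("#gvResults tr")[1:]:
    # every data row on the current page, header excluded
    data = [item.get_text(strip=True) for item in items.select("td")]
    print(data)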

Related

How to fetch href link from a website using BeautifulSoup

I am trying to get all the article links from the website below.
However, my code does not print anything at all, although I specified the class id and the path to it.
Below is my code.
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.request import urlopen

html = urlopen("https://uynaa.wordpress.com/category/%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb/").read()
soup = BeautifulSoup(html, "lxml")
productDivs = soup.findAll('div', attrs={'class': 'post type-post status-publish format-standard hentry category-56456384'})
for div in productDivs:
    print(div.find('h2')[a]['href'])
How do I fetch all the links?
The links are loaded dynamically via JavaScript from an external URL. You can use this example to print all the links:
import json
import requests
from bs4 import BeautifulSoup

# keep the request payload separate from the JSON response
payload = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'

page = 1
while True:
    payload['page'] = page
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    for p in data['postflair']:
        print(p)

    if data['lastbatch']:
        break
    page += 1
Prints:
https://uynaa.wordpress.com/2014/01/02/2013-in-review/
https://uynaa.wordpress.com/2013/10/07/%d0%b0%d1%84%d0%b3%d0%b0%d0%bd%d0%b8%d1%81%d1%82%d0%b0%d0%bd-%d0%b0%d0%bd%d1%85%d0%b4%d0%b0%d0%b3%d1%87-%d1%88%d0%b0%d0%bb%d1%82%d0%b3%d0%b0%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d0%b5-%d0%ba%d0%b0%d1%81%d0%bf%d0%b5%d1%80%d1%81%d0%ba%d0%b8%d0%b9-%d0%b1%d0%b8-%d0%b4%d0%b0%d1%80%d0%b0%d0%bd%d0%b3%d1%83%d0%b9%d0%bb%d0%b0%d0%bb-%d1%82%d0%be%d0%b3%d1%82%d0%be%d0%be-%d0%b3%d1%8d/
https://uynaa.wordpress.com/2013/10/07/%d1%88%d0%b0%d0%bd%d1%85%d0%b0%d0%b9-%d0%bd%d0%be%d0%b3%d0%be%d0%be%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d1%8d%d0%bd%d1%8d-%d0%b3%d0%b0%d0%b7%d0%b0%d1%80-%d0%bc%d0%b0%d0%bd%d0%b0%d0%b9%d1%85-%d0%b1%d0%b0%d0%b9%d1%81%d0%b0%d0%bd-%d1%8e%d0%bc/
https://uynaa.wordpress.com/2013/10/07/500-%d0%b6%d0%b8%d0%bb-%d0%b0%d1%80%d1%87%d0%bb%d1%83%d1%83%d0%bb%d0%b0%d0%b0%d0%b3%d2%af%d0%b9-%d0%b4%d1%8d%d0%bb%d1%85%d0%b8%d0%b9%d0%bd-%d1%86%d0%be%d1%80%d1%8b%d0%bd-%d0%b3%d0%b0%d0%bd%d1%86/
https://uynaa.wordpress.com/2013/02/01/%d1%83%d0%bb%d0%b7-%d0%bd%d1%83%d1%82%d0%b3%d0%b8%d0%b9%d0%bd-%d0%bf%d0%b8%d1%84%d0%b0%d0%b3%d0%be%d1%80/
https://uynaa.wordpress.com/2013/01/21/%d1%82%d0%b5%d0%bb%d0%b5%d0%b2%d0%b8%d0%b7%d0%b8%d0%b9%d0%bd-%d1%82%d2%af%d2%af%d1%85%d1%8d%d0%bd-%d0%b4%d1%8d%d1%85-%d1%85%d0%b0%d0%bc%d0%b3%d0%b8%d0%b9%d0%bd-%d0%b3%d0%b0%d0%b6%d0%b8%d0%b3-%d1%88/
https://uynaa.wordpress.com/2013/01/18/%d0%b0%d0%bf%d0%be%d1%84%d0%b8%d1%81-%d0%be%d0%be%d1%81-%d2%af%d2%af%d0%b4%d1%8d%d0%bd-%d3%a9%d1%80%d0%bd%d3%a9%d1%85-%d0%b6%d2%af%d0%b6%d0%b8%d0%b3/
https://uynaa.wordpress.com/2013/01/17/%d0%b0%d1%80%d0%b8%d1%83%d0%bd%d1%82%d0%bd%d1%8b-%d0%bd%d1%83%d1%82%d0%b0%d0%b3-%d0%b8%d0%b9%d0%b3-%d1%8d%d0%b7%d1%8d%d0%b3%d0%bd%d1%8d%d1%85-%d1%85%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/01/15/%d1%81%d0%b0%d1%83%d0%b4%d1%8b%d0%bd-%d1%82%d0%b0%d0%b3%d0%bd%d1%83%d1%83%d0%bb%d1%87%d0%b8%d0%b4-%d0%b0%d1%81%d0%b0%d0%b4%d1%8b%d0%b3-%d0%be%d0%bb%d0%b6%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/15/%d0%bc%d0%b0%d0%bb%d0%b8%d0%b3%d1%8d%d1%8d%d1%81-%d1%81%d0%be%d0%bc%d0%b0%d0%bb%d0%b8-%d1%85%d2%af%d1%80%d1%82%d1%8d%d0%bb/
https://uynaa.wordpress.com/2013/01/10/%d1%85%d0%be%d1%80%d0%b2%d0%be%d0%be-%d0%b5%d1%80%d1%82%d3%a9%d0%bd%d1%86-%d1%85%d0%b0%d0%bb%d0%b0%d0%b0%d1%81%d0%b0%d0%bd%d0%b4-%d0%b1%d0%b0%d0%b3%d1%82%d0%b0%d0%bd%d0%b0/
https://uynaa.wordpress.com/2013/01/10/%d1%82%d0%b0%d0%bd%d0%b3%d0%b0%d1%80%d0%b0%d0%b3-%d3%a9%d1%80%d0%b3%d3%a9%d1%85-%d1%91%d1%81%d0%bb%d0%be%d0%bb-%d1%85%d2%af%d0%bb%d1%8d%d1%8d%d0%b6-%d0%b1%d0%b0%d0%b9%d0%b3-%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/09/%d0%b1%d0%be%d0%bb%d0%bb%d0%b8%d0%b2%d1%83%d0%b4%d1%8b%d0%bd-%d0%ba%d0%b8%d0%bd%d0%be%d0%bd%d0%be%d0%be%d1%81-%d1%87-%d0%b0%d0%b9%d0%bc%d0%b0%d0%b0%d1%80/
https://uynaa.wordpress.com/2013/01/08/%d0%bf%d0%b5%d0%bd%d1%82%d0%b0%d0%b3%d0%be%d0%bd-%d0%b1%d0%be%d0%bb%d0%be%d0%bd-%d1%82%d1%82%d0%b3-%d1%8b%d0%b3-%d1%83%d0%b4%d0%b8%d1%80%d0%b4%d0%b0%d1%85-%d0%bc%d0%b0%d0%b3%d0%b0%d0%b4%d0%bb%d0%b0/
https://uynaa.wordpress.com/2013/01/07/%d0%b7%d0%b8%d0%b0%d0%b4-%d1%82%d0%b0%d0%ba%d0%b8%d0%b5%d0%b4%d0%b4%d0%b8%d0%bd/
...and so on.
EDIT: To filter the links to only the specified category, you can use this script:
import json
import requests
from bs4 import BeautifulSoup

# keep the request payload separate from the JSON response
payload = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'

all_links = []
page = 1
while True:
    payload['page'] = page
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    soup = BeautifulSoup(data['html'], 'html.parser')
    for p in soup.select('.post'):
        if any('%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb' in cat['href'] for cat in p.select('[rel="category tag"]')):
            if p.h2.a['href'] not in all_links:
                print(p.h2.a['href'])
                all_links.append(p.h2.a['href'])

    if data['lastbatch']:
        break
    page += 1

print(len(all_links))
Prints 135 links:
...
https://uynaa.wordpress.com/2011/05/13/%e2%80%9c%d1%83%d1%85%d0%b0%d0%b0%d0%bd-%d0%bc%d1%83%d1%83%d1%82%d0%bd%d1%83%d1%83%d0%b4%d1%8b%d0%bd-%d2%af%d0%b5%e2%80%9d/
https://uynaa.wordpress.com/2011/05/04/%d2%af%d1%85%d0%bb%d0%b8%d0%b9%d0%bd-%d1%82%d0%be%d0%b3%d0%bb%d0%be%d0%be%d0%bc/
https://uynaa.wordpress.com/2011/05/04/%d0%be%d1%81%d0%b0%d0%bc%d0%b0-%d0%b1%d0%b8%d0%bd-%d0%bb%d0%b0%d0%b4%d0%b5%d0%bd%d0%b8%d0%b9%d0%b3-%d1%8f%d0%b0%d0%b6-%d0%b8%d0%bb%d1%80%d2%af%d2%af%d0%bb%d1%81%d1%8d%d0%bd-%d0%b1%d1%8d/
135
Not sure why your code doesn't work. For me, I used the code below to get all the links first.
# `soup` is the BeautifulSoup object from the question's code
list_href = []
a_tags = soup.find_all('a')
for tag in a_tags:
    list_href.append(tag.get('href'))
The links of the articles are in list_href[5:26].
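If relying on fixed indices like list_href[5:26] feels fragile, here is a hedged alternative sketch; it assumes the article titles are rendered as links inside h2 elements (as in the markup the question targets), which may not hold for every theme:
# select the post-title links directly instead of slicing all <a> tags
article_links = [a.get('href') for a in soup.select('h2 a')]
print(article_links)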

How to print all results of Beautiful Soup at once?

I have a list of Twitter usernames. I need to get their number of followers. I used BeautifulSoup and requests; however, I only receive one account's result every time.
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv', engine='python')
followers = purcsv['username']
followers.head(10)

handle = purcsv['username'][0:40]
temp = ("https://twitter.com/" + handle)
temp = temp.tolist()

for url in temp:
    page = requests.get(url)

bs = BeautifulSoup(page.text, 'lxml')
follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
print("Number of followers: {} ".format(followers.get('data-count')))
That's because you are looping over the URLs first and fetching the content for each into the same variable page here:
for url in temp:
    page = requests.get(url)
so page will always contain the last URL fetched. To solve this, you need to process each page as soon as it is fetched:
followers_list = []
for url in temp:
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))

print(followers_list)
Here is a full example to verify:
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv')
followers = purcsv['username']
handles = purcsv['username'][0:40].tolist()

followers_list = []
for handle in handles:
    url = "https://twitter.com/" + handle
    try:
        page = requests.get(url)
    except Exception as e:
        print(f"Failed to fetch page for url {url} due to: {e}")
        continue
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))

print(followers_list)
output:
Number of followers: 13714085
Number of followers: 4706511
['13714085', '4706511']
You may consider using async functions for fetching and processing those URLs if you have too many of them.
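For example, here is a minimal asyncio sketch; it assumes aiohttp is installed and that the profile pages still expose the same ProfileNav markup used above:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch_followers(session, handle):
    # fetch one profile page and pull out the data-count attribute
    url = "https://twitter.com/" + handle
    async with session.get(url) as resp:
        text = await resp.text()
    bs = BeautifulSoup(text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    if follow_box is None:
        return None
    value = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    return value.get('data-count')

async def main(handles):
    # fetch all profiles concurrently instead of one request at a time
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_followers(session, h) for h in handles))

# followers_list = asyncio.run(main(handles))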

Can't parse different product links from a webpage

I've created a script in Python to fetch different product links from a webpage. Although I know the content of that site is dynamic, I tried the conventional way to show what I've attempted. I looked for APIs in the dev tools but could not find one. Isn't there any way to get those links using requests?
Site Link
What I've written so far:
import requests
from bs4 import BeautifulSoup

link = "https://www.amazon.com/stores/node/10699640011"

def fetch_product_links(url):
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")
    for item_link in soup.select("[id^='ProductGrid-'] li[class^='style__itemOuter__'] > a"):
        print(item_link.get("href"))

if __name__ == '__main__':
    fetch_product_links(link)
How can I fetch different product links from that site using requests?
I think you only need the ASINs, which you can collect from another URL construct visible in the network tab, i.e. you can significantly shorten the final URLs. You do, however, need to make a request to your original URL to pick up an identifier to use in the second URL. This returns 146 links.
import requests, re, json

node = '10699640011'

with requests.Session() as s:
    r = s.get(f'https://www.amazon.com/stores/node/{node}')
    p = re.compile(r'var slotsStr = "\[(.*?,){3} share\]";')
    identifier = p.findall(r.text)[0]
    identifier = identifier.strip()[:-1]
    r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
    p = re.compile(r'var config = (.*?);')
    data = json.loads(p.findall(r.text)[0])
    asins = data['content']['ASINList']
    links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
    print(links)
EDIT:
With two given nodes:
import requests, re, json
from bs4 import BeautifulSoup as bs

nodes = ['3039806011', '10699640011']

with requests.Session() as s:
    for node in nodes:
        r = s.get(f'https://www.amazon.com/stores/node/{node}')
        soup = bs(r.content, 'lxml')
        identifier = soup.select('.stores-widget-btf:not([id=share],[id*=RECOMMENDATION])')[-1]['id']
        r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
        p = re.compile(r'var config = (.*?);')
        data = json.loads(p.findall(r.text)[0])
        asins = data['content']['ASINList']
        links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
        print(links)

Problem sending data through post request in Python

I am trying to input a decision start and end date into 2 input boxes on the Gosport Council website by sending a post request. Whenever I print out the text received after I send the request, it gives me the info shown on the input page, not the results page.
import requests

payload = {
    "applicationDecisionStart": "1/8/2018",
    "applicationDecisionEnd": "1/10/2018",
}

with requests.Session() as session:
    r = session.get("https://publicaccess.gosport.gov.uk/online-applications/search.do?action=advanced", timeout=10, data=payload)
    print(r.text)
When I execute it, I want it to print out the HTML with the href links, for example:
<a href="/online-applications/applicationDetails.do?keyVal=PEA12JHO07E00&activeTab=summary">
But my code won't show anything like this.
I observe that the request is a POST, not the GET you are doing, and it is as follows (ignoring empty fields in the POST):
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application'
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)

## later pages
#https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page=2
Loop over pages:
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application'
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)

    pages = int(soup.select('span + a.page')[-1].text)

    for page in range(2, pages + 1):
        r = s.get('https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page={}'.format(page))
        soup = bs(r.content, 'lxml')
        info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
        print(info)
The URL and data are incorrect.
Use Chrome to analyse the request: press F12 to open the Developer Tools, switch to the "Network" tab, then submit your page and analyse the first request initiated by Chrome.
What you need:
Headers - General - Request URL
Headers - Request Headers
Headers - Form Data
You also need a package to parse the HTML, such as bs4.
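As a minimal sketch of that workflow (the URL and field names below are placeholders; copy the real Request URL and Form Data from the Network tab, as the answer above does for this site):
import requests
from bs4 import BeautifulSoup

# placeholders: replace with the Request URL and Form Data shown in DevTools
url = 'https://example.com/advancedSearchResults.do?action=firstPage'
headers = {'User-Agent': 'Mozilla/5.0'}
data = {'FIELD_NAME': 'FIELD_VALUE'}

r = requests.post(url, headers=headers, data=data)
soup = BeautifulSoup(r.text, 'lxml')
print(soup.select('#searchresults a'))  # selector used in the answer above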

Scraping and parsing multi-page (aspx) table

I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for an HTML link that I could use for the next iteration of the scrape, but no luck.
Further inspection, looking at the network traffic, shows that the browser sends a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages for one dog, either by iterating over all pages or by increasing the page length to show 100+ rows on page 1. The underlying site is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup

url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'

with requests.session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0'
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')

    target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
    data = {tag['name']: tag['value']
            for tag in soup.select('input[name^=ctl00]') if tag.get('value')}
    state = {tag['name']: tag['value']
             for tag in soup.select('input[name^=__]')}
    data.update(state)

    numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])

    # for page in range(last_page + 1):
    for page in range(numberpages):
        data['__EVENTTARGET'] = target.format(page)
        #data['__VIEWSTATE'] = target.format(page)
        print(10)
        r = s.post(url, data=data)
        soup = BeautifulSoup(r.content, 'html5lib')

        tables = soup.findChildren('table')
        my_table = tables[9]
        rows = my_table.findChildren(['th', 'tr'])

        tabel = [[]]
        for i in range(len(rows)):
            cells = rows[i].findChildren('td')
            tabel.append([])
            for j in range(len(cells)):
                value = cells[j].string
                tabel[i].append(value)

        table = []
        for i in range(len(tabel)):
            if len(tabel[i]) == 16:
                del tabel[i][-2:]
                table.append(tabel[i])
In this case, for each page requested, a POST request is issued with the form-url-encoded parameters __EVENTTARGET and __VIEWSTATE:
__VIEWSTATE can easily be extracted from an input tag.
__EVENTTARGET is different for each page, and its value is passed to a JavaScript function for each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The Python script:
from bs4 import BeautifulSoup
import requests
import re

# extract data from page
def extract_data(soup):
    tables = soup.find_all("div", {"class": "race-card"})[0].find_all("tbody")
    item_list = [
        (
            t[0].text.strip(),   #date
            t[1].text.strip(),   #dist
            t[2].text.strip(),   #TP
            t[3].text.strip(),   #StmHCP
            t[4].text.strip(),   #Fin
            t[5].text.strip(),   #By
            t[6].text.strip(),   #WinnerOr2nd
            t[7].text.strip(),   #Venue
            t[8].text.strip(),   #Remarks
            t[9].text.strip(),   #WinTime
            t[10].text.strip(),  #Going
            t[11].text.strip(),  #SP
            t[12].text.strip(),  #Class
            t[13].text.strip()   #CalcTm
        )
        for t in (t.find_all('td') for t in tables[1].find_all('tr'))
        if t
    ]
    print(item_list)

session = requests.Session()
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# get view state value
view_state = soup.find_all("input", {"id": "__VIEWSTATE"})[0]["value"]

# get all event target values
event_target = soup.find_all("div", {"class": "rgNumPart"})[0]
event_target_list = [
    re.search('__doPostBack\(\'(.*)\',', t["href"]).group(1)
    for t in event_target.find_all('a')
]

# extract data for the 1st page
extract_data(soup)

# extract data for each page except the first
for link in event_target_list[1:]:
    print("get page {0}".format(link))
    post_data = {
        '__EVENTTARGET': link,
        '__VIEWSTATE': view_state
    }
    response = session.post(url, data=post_data)
    soup = BeautifulSoup(response.content, "html.parser")
    extract_data(soup)
