I am trying to enter a decision start date and end date into two input boxes on the Gosport Council website by sending a POST request. Whenever I print the text received after sending the request, it gives me the HTML of the input page, not the loaded results page.
import requests
# Address of the advanced-search form that the request is sent to.
SEARCH_FORM_URL = (
    "https://publicaccess.gosport.gov.uk/online-applications/"
    "search.do?action=advanced"
)

# Decision date range the asker wants to submit.
payload = dict(
    applicationDecisionStart="1/8/2018",
    applicationDecisionEnd="1/10/2018",
)

with requests.Session() as session:
    # The payload is handed to a GET request via `data`, exactly as in the
    # question; the response body is printed as-is.
    r = session.get(SEARCH_FORM_URL, data=payload, timeout=10)
    print(r.text)
If I execute it I want it to print out the HTML with the href links for example
<a href="/online-applications/applicationDetails.do?keyVal=PEA12JHO07E00&activeTab=summary">
But my code won't show anything like this
I observe the POST, not GET which you are doing, is as follows (ignoring empty fields in POST):
from bs4 import BeautifulSoup as bs
import requests
# Endpoint that receives the advanced-search form submission.
FIRST_PAGE_URL = (
    'https://publicaccess.gosport.gov.uk/online-applications/'
    'advancedSearchResults.do?action=firstPage'
)

# Only the non-empty fields of the search form are sent.
payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}

with requests.Session() as s:
    r = s.post(FIRST_PAGE_URL, data=payload)
    soup = bs(r.content, 'lxml')
    # One (link text, href) pair per anchor inside the #searchresults element.
    anchors = soup.select('#searchresults a')
    info = [(item.text.strip(), item['href']) for item in anchors]
    print(info)
## later pages
#https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page=2
Loop over pages:
from bs4 import BeautifulSoup as bs
import requests
# Endpoint for the form submission (first page of results).
FIRST_PAGE_URL = (
    'https://publicaccess.gosport.gov.uk/online-applications/'
    'advancedSearchResults.do?action=firstPage'
)
# Endpoint for every later page; the page number is filled in with .format().
PAGED_URL = (
    'https://publicaccess.gosport.gov.uk/online-applications/'
    'pagedSearchResults.do?action=page&searchCriteria.page={}'
)

# Only the non-empty fields of the search form are sent.
payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}


def _result_links(soup):
    """Return a (link text, href) pair for each anchor inside #searchresults."""
    return [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]


# The same session is used for every request, so the paged GETs run in the
# session that submitted the search.
with requests.Session() as s:
    r = s.post(FIRST_PAGE_URL, data=payload)
    soup = bs(r.content, 'lxml')
    info = _result_links(soup)
    print(info)

    # The text of the last pager link is used as the page count. When the
    # selector matches nothing (no pager on the page) indexing [-1] would raise
    # IndexError, so treat that case as a single page.
    page_links = soup.select('span + a.page')
    pages = int(page_links[-1].text) if page_links else 1

    for page in range(2, pages + 1):
        r = s.get(PAGED_URL.format(page))
        soup = bs(r.content, 'lxml')
        info = _result_links(soup)
        print(info)
The URL and the data are incorrect.
Use Chrome to analyse the request:
Press F12 to open Developer Tools and switch to the "Network" tab. Then submit the form on your page and analyse the first request initiated by Chrome.
What you need:
Headers - General - Request URL
Headers - Request Headers
Headers - Form Data
You also need a package to parse the HTML, such as bs4.
Related
The code can not get the next page, it only repeats in an infinite loop. I am using the example from oxylabs
Could you tell me what I'm doing wrong? Thank you.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
# Page the crawl starts from.
url = 'https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html'
while True:
    response = requests.get(url)
    soup = bs(response.content, "lxml")
    # Print the link text found in every <td class="STOCK_CODE"> cell.
    symbols = soup.find_all('td', class_='STOCK_CODE' )
    for s in symbols:
        symbol = s.find('a').text
        print(symbol)
    # Try to locate a "next" element and follow its href, resolved against
    # the current URL; stop when no element is returned.
    next_page = soup.select_one('span', id = 'next')
    if next_page:
        next_url = next_page.get('href')
        url = urljoin(url, next_url)
    else:
        break
    # NOTE(review): the snippet's indentation was lost; this print is assumed
    # to sit inside the loop after the if/else -- confirm.
    print(url)
The information you want for the other pages is being returned via another call. You need to recreate that call (use your browser's network tools to see what is happening).
The request requires a token that is returned when the homepage is requested. This needs to be provided when requesting the other pages.
For example:
from bs4 import BeautifulSoup as bs
import requests
HOME_URL = 'https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html'
LIST_URL = 'https://hnx.vn/ModuleIssuer/List/ListSearch_Datas'
TOKEN_NAME = '__RequestVerificationToken'

session = requests.Session()

# The homepage carries the verification token in a <meta> tag; it has to be
# sent back as a header on every listing request.
req_homepage = session.get(HOME_URL)
soup_homepage = bs(req_homepage.content, "lxml")

tokens = [
    meta['content']
    for meta in soup_homepage.find_all('meta')
    if meta.get('name', None) == TOKEN_NAME
]
if not tokens:
    # Fail here with a clear message instead of a NameError further down.
    raise RuntimeError(f"{TOKEN_NAME} meta tag not found on {HOME_URL}")
token = tokens[-1]  # last match wins, as in a plain loop-and-assign

data = {
    "p_issearch": 0,
    "p_keysearch": "",
    "p_market_code": "",
    "p_orderby": "STOCK_CODE",
    "p_ordertype": "ASC",
    "p_currentpage": 2,  # placeholder; overwritten for each page below
    "p_record_on_page": 10,
}

headers = {
    "Referer": HOME_URL,
    "__RequestVerificationToken": token,
    "X-Requested-With": "XMLHttpRequest",
}

for page in range(1, 4):
    print(f"Page {page}")
    data['p_currentpage'] = page
    req = session.post(LIST_URL, data=data, headers=headers)
    # The JSON reply wraps an HTML fragment under the 'Content' key.
    json_content = req.json()['Content']
    soup = bs(json_content, "lxml")

    for td in soup.find_all('td', class_='STOCK_CODE'):
        symbol = td.find('a').text
        print('  ', symbol)
This would give you the following output:
Page 1
AAV
ACM
ADC
ALT
AMC
AME
AMV
API
APP
APS
Page 2
ARM
ART
ATS
BAB
BAX
BBS
BCC
BCF
BDB
BED
Page 3
BII
BKC
BLF
BNA
BPC
BSC
BST
BTS
BTW
BVS
I am trying to get all the article links in a given website below.
However, my code does not print anything at all although I specified the class id and the path to it.
below is my code.
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.request import urlopen
CATEGORY_URL = "https://uynaa.wordpress.com/category/%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb/"
POST_CLASSES = 'post type-post status-publish format-standard hentry category-56456384'

# Download the category page and parse it.
html = urlopen(CATEGORY_URL).read()
soup = BeautifulSoup(html, "lxml")

# Select the divs whose class attribute equals the full class string above.
productDivs = soup.findAll('div', attrs={'class': POST_CLASSES})
for div in productDivs:
    heading = div.find('h2')
    # NOTE: `a` is a bare name that is never defined in this snippet.
    print(heading[a]['href'])
How do I fetch all the links?
The links are loaded dynamically via JavaScript from external URL. You can use this example to print all links:
import json
import requests
from bs4 import BeautifulSoup
# Form fields for the infinite-scroll endpoint. Kept under its own name so the
# JSON response never overwrites it and every request sends the same fields.
payload = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'

page = 1
while True:
    payload['page'] = page
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    for p in data['postflair']:
        print(p)

    # The response marks the final batch with a truthy 'lastbatch' value.
    if data['lastbatch']:
        break

    page += 1
Prints:
https://uynaa.wordpress.com/2014/01/02/2013-in-review/
https://uynaa.wordpress.com/2013/10/07/%d0%b0%d1%84%d0%b3%d0%b0%d0%bd%d0%b8%d1%81%d1%82%d0%b0%d0%bd-%d0%b0%d0%bd%d1%85%d0%b4%d0%b0%d0%b3%d1%87-%d1%88%d0%b0%d0%bb%d1%82%d0%b3%d0%b0%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d0%b5-%d0%ba%d0%b0%d1%81%d0%bf%d0%b5%d1%80%d1%81%d0%ba%d0%b8%d0%b9-%d0%b1%d0%b8-%d0%b4%d0%b0%d1%80%d0%b0%d0%bd%d0%b3%d1%83%d0%b9%d0%bb%d0%b0%d0%bb-%d1%82%d0%be%d0%b3%d1%82%d0%be%d0%be-%d0%b3%d1%8d/
https://uynaa.wordpress.com/2013/10/07/%d1%88%d0%b0%d0%bd%d1%85%d0%b0%d0%b9-%d0%bd%d0%be%d0%b3%d0%be%d0%be%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d1%8d%d0%bd%d1%8d-%d0%b3%d0%b0%d0%b7%d0%b0%d1%80-%d0%bc%d0%b0%d0%bd%d0%b0%d0%b9%d1%85-%d0%b1%d0%b0%d0%b9%d1%81%d0%b0%d0%bd-%d1%8e%d0%bc/
https://uynaa.wordpress.com/2013/10/07/500-%d0%b6%d0%b8%d0%bb-%d0%b0%d1%80%d1%87%d0%bb%d1%83%d1%83%d0%bb%d0%b0%d0%b0%d0%b3%d2%af%d0%b9-%d0%b4%d1%8d%d0%bb%d1%85%d0%b8%d0%b9%d0%bd-%d1%86%d0%be%d1%80%d1%8b%d0%bd-%d0%b3%d0%b0%d0%bd%d1%86/
https://uynaa.wordpress.com/2013/02/01/%d1%83%d0%bb%d0%b7-%d0%bd%d1%83%d1%82%d0%b3%d0%b8%d0%b9%d0%bd-%d0%bf%d0%b8%d1%84%d0%b0%d0%b3%d0%be%d1%80/
https://uynaa.wordpress.com/2013/01/21/%d1%82%d0%b5%d0%bb%d0%b5%d0%b2%d0%b8%d0%b7%d0%b8%d0%b9%d0%bd-%d1%82%d2%af%d2%af%d1%85%d1%8d%d0%bd-%d0%b4%d1%8d%d1%85-%d1%85%d0%b0%d0%bc%d0%b3%d0%b8%d0%b9%d0%bd-%d0%b3%d0%b0%d0%b6%d0%b8%d0%b3-%d1%88/
https://uynaa.wordpress.com/2013/01/18/%d0%b0%d0%bf%d0%be%d1%84%d0%b8%d1%81-%d0%be%d0%be%d1%81-%d2%af%d2%af%d0%b4%d1%8d%d0%bd-%d3%a9%d1%80%d0%bd%d3%a9%d1%85-%d0%b6%d2%af%d0%b6%d0%b8%d0%b3/
https://uynaa.wordpress.com/2013/01/17/%d0%b0%d1%80%d0%b8%d1%83%d0%bd%d1%82%d0%bd%d1%8b-%d0%bd%d1%83%d1%82%d0%b0%d0%b3-%d0%b8%d0%b9%d0%b3-%d1%8d%d0%b7%d1%8d%d0%b3%d0%bd%d1%8d%d1%85-%d1%85%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/01/15/%d1%81%d0%b0%d1%83%d0%b4%d1%8b%d0%bd-%d1%82%d0%b0%d0%b3%d0%bd%d1%83%d1%83%d0%bb%d1%87%d0%b8%d0%b4-%d0%b0%d1%81%d0%b0%d0%b4%d1%8b%d0%b3-%d0%be%d0%bb%d0%b6%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/15/%d0%bc%d0%b0%d0%bb%d0%b8%d0%b3%d1%8d%d1%8d%d1%81-%d1%81%d0%be%d0%bc%d0%b0%d0%bb%d0%b8-%d1%85%d2%af%d1%80%d1%82%d1%8d%d0%bb/
https://uynaa.wordpress.com/2013/01/10/%d1%85%d0%be%d1%80%d0%b2%d0%be%d0%be-%d0%b5%d1%80%d1%82%d3%a9%d0%bd%d1%86-%d1%85%d0%b0%d0%bb%d0%b0%d0%b0%d1%81%d0%b0%d0%bd%d0%b4-%d0%b1%d0%b0%d0%b3%d1%82%d0%b0%d0%bd%d0%b0/
https://uynaa.wordpress.com/2013/01/10/%d1%82%d0%b0%d0%bd%d0%b3%d0%b0%d1%80%d0%b0%d0%b3-%d3%a9%d1%80%d0%b3%d3%a9%d1%85-%d1%91%d1%81%d0%bb%d0%be%d0%bb-%d1%85%d2%af%d0%bb%d1%8d%d1%8d%d0%b6-%d0%b1%d0%b0%d0%b9%d0%b3-%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/09/%d0%b1%d0%be%d0%bb%d0%bb%d0%b8%d0%b2%d1%83%d0%b4%d1%8b%d0%bd-%d0%ba%d0%b8%d0%bd%d0%be%d0%bd%d0%be%d0%be%d1%81-%d1%87-%d0%b0%d0%b9%d0%bc%d0%b0%d0%b0%d1%80/
https://uynaa.wordpress.com/2013/01/08/%d0%bf%d0%b5%d0%bd%d1%82%d0%b0%d0%b3%d0%be%d0%bd-%d0%b1%d0%be%d0%bb%d0%be%d0%bd-%d1%82%d1%82%d0%b3-%d1%8b%d0%b3-%d1%83%d0%b4%d0%b8%d1%80%d0%b4%d0%b0%d1%85-%d0%bc%d0%b0%d0%b3%d0%b0%d0%b4%d0%bb%d0%b0/
https://uynaa.wordpress.com/2013/01/07/%d0%b7%d0%b8%d0%b0%d0%b4-%d1%82%d0%b0%d0%ba%d0%b8%d0%b5%d0%b4%d0%b4%d0%b8%d0%bd/
...and so on.
EDIT: To filter the links only to specified category, you can use this script:
import json
import requests
from bs4 import BeautifulSoup
# URL-encoded slug that identifies the wanted category in a category link.
CATEGORY_SLUG = '%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb'

# Form fields for the infinite-scroll endpoint. Kept under its own name so the
# JSON response never overwrites it and every request sends the same fields.
payload = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'

all_links = []
page = 1
while True:
    payload['page'] = page
    data = requests.post(api_url, data=payload).json()

    # uncomment next line to print all data:
    # print(json.dumps(data, indent=4))

    # The rendered posts arrive as an HTML fragment under the 'html' key.
    soup = BeautifulSoup(data['html'], 'html.parser')
    for p in soup.select('.post'):
        category_links = p.select('[rel="category tag"]')
        if any(CATEGORY_SLUG in cat['href'] for cat in category_links):
            href = p.h2.a['href']
            # Report each article link only once.
            if href not in all_links:
                print(href)
                all_links.append(href)

    # The response marks the final batch with a truthy 'lastbatch' value.
    if data['lastbatch']:
        break

    page += 1

print(len(all_links))
Prints 135 links:
...
https://uynaa.wordpress.com/2011/05/13/%e2%80%9c%d1%83%d1%85%d0%b0%d0%b0%d0%bd-%d0%bc%d1%83%d1%83%d1%82%d0%bd%d1%83%d1%83%d0%b4%d1%8b%d0%bd-%d2%af%d0%b5%e2%80%9d/
https://uynaa.wordpress.com/2011/05/04/%d2%af%d1%85%d0%bb%d0%b8%d0%b9%d0%bd-%d1%82%d0%be%d0%b3%d0%bb%d0%be%d0%be%d0%bc/
https://uynaa.wordpress.com/2011/05/04/%d0%be%d1%81%d0%b0%d0%bc%d0%b0-%d0%b1%d0%b8%d0%bd-%d0%bb%d0%b0%d0%b4%d0%b5%d0%bd%d0%b8%d0%b9%d0%b3-%d1%8f%d0%b0%d0%b6-%d0%b8%d0%bb%d1%80%d2%af%d2%af%d0%bb%d1%81%d1%8d%d0%bd-%d0%b1%d1%8d/
135
Not sure why your code doesn't work. For me, I used the code below to get all the links first.
# Collect the href attribute of every anchor in the parsed page.
a_tags = soup.find_all('a')
list_href = [tag.get('href') for tag in a_tags]
The links of the articles are in list_href[5:26].
I'm trying to scrape this site:
website address
If I manually search for A, I see the results spread across multiple pages but when I try to fetch the results using my script below, I get the results from the first page repeatedly:
I've tried with:
import requests
from bs4 import BeautifulSoup
url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    # Start from every named <input> of the most recently parsed page ...
    payload = {}
    for field in soup.select('input[name]'):
        payload[field['name']] = field.get('value', '')
    # ... then set the search term and the grid paging postback fields.
    payload.update({
        'txtSearch': 'A',
        '__EVENTTARGET': 'gvResults',
        '__EVENTARGUMENT': f'Page${page}',
    })

    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")

    # Print only the first data row (the slice skips the header row).
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
How can I get the results from other pages as well?
Your problem happens at below line
payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
What happens is that when you request the 2nd page, it sends an extra payload field, btnSearch, which causes the request to be treated as a new search operation instead of a next-page operation.
Well the fix is quite simple, below is the updated code
import requests
from bs4 import BeautifulSoup
url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    # Start from every named <input> of the most recently parsed page, then
    # set the search term and the grid paging postback fields.
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['txtSearch'] = 'A'
    payload['__EVENTTARGET'] = 'gvResults'
    payload['__EVENTARGUMENT'] = f'Page${page}'

    if page > 1:
        # Sending btnSearch makes the server run a fresh search instead of
        # paging. The default avoids a KeyError if the field is not present.
        payload.pop('btnSearch', None)

    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")

    # Print only the first data row (the slice skips the header row).
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
import requests
from bs4 import BeautifulSoup
REPORT_URL = 'http://karnatakamedicalcouncil.com/RenewalReport.aspx'

try:
    for count in range(123401, 123405):
        ctl00_RightContetHolder_TextBox1 = count
        form = {'ctl00_RightContetHolder_TextBox1': count, 'Search': "submit"}
        r = requests.post(REPORT_URL, data=form)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Walk the children of the first table with class mGrid and, for each
        # child, iterate over what find() returns for a.Viewdetails.
        grid = soup.find('table', {'class': 'mGrid'})
        for i in grid:
            for links in i.find('a', class_='Viewdetails'):
                print(links)
except:
    # Every exception raised above is discarded here, so a failure anywhere
    # in the loop produces no output at all.
    pass
I am trying to get each of the links in the mGrid tables, but haven't been able to retrieve them with beautiful soup. I don't understand why the anchor tags are not being found, or if they are being found why they are not being printed. Please help me.
The request is missing the required __VIEWSTATE and __EVENTVALIDATION data. To get them, first make a GET request and extract the values of the hidden inputs with those IDs; then you can make the POST (search) request with that data.
url = 'http://karnatakamedicalcouncil.com/RenewalReport.aspx'

# Load the form once to read the hidden ASP.NET state fields that must be
# echoed back with every search POST.
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
VIEWSTATE = soup.find(id='__VIEWSTATE')['value']
EVENTVALIDATION = soup.find(id='__EVENTVALIDATION')['value']

for count in range(123401, 123405):
    data = {
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEENCRYPTED': '',
        '__EVENTVALIDATION': EVENTVALIDATION,
        'ctl00$RightContetHolder$TextBox1': count,
        'ctl00$RightContetHolder$hdnSearch': "Search",
    }
    r = requests.post(url, data=data)
    soup = BeautifulSoup(r.text, 'html.parser')
    for links in soup.findAll('a', class_='Viewdetails'):
        # print() with a single argument behaves the same on Python 2 and 3;
        # the bare `print x` statement is a SyntaxError on Python 3.
        print(links['href'])
I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for a HTML link, that I could use for the next iteration of the scrape, but no luck.
Further inspection, by looking at network traffic, shows that the browser send a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages of one dog, either by iterating over all pages, or by increasing the page length to show 100+ lines on page 1. The underlying database is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup
# NOTE(review): the snippet's indentation was lost; the nesting below (all of
# the parsing inside the per-page loop, inside the session block) is a
# reconstruction -- confirm against the original post.
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
with requests.session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0'
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')
    target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
    # Form fields: every ctl00* input that has a non-empty value ...
    data = { tag['name']: tag['value']
        for tag in soup.select('input[name^=ctl00]') if tag.get('value')
    }
    # ... plus every input whose name starts with a double underscore.
    state = { tag['name']: tag['value']
        for tag in soup.select('input[name^=__]')
    }
    data.update(state)
    # Page count: take the second-to-last space-separated piece of the pager
    # div's markup and keep the text between its '>' and '<'.
    numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])
    # for page in range(last_page + 1):
    for page in range(numberpages):
        # `target` contains no {} placeholder, so .format(page) yields the
        # same string for every page.
        data['__EVENTTARGET'] = target.format(page)
        #data['__VIEWSTATE'] = target.format(page)
        print(10)
        r = s.post(url, data=data)
        soup = BeautifulSoup(r.content, 'html5lib')
        tables = soup.findChildren('table')
        # The tenth table on the page is the one that gets parsed.
        my_table = tables[9]
        rows = my_table.findChildren(['th', 'tr'])
        # Build one list of cell strings per row.
        tabel = [[]]
        for i in range(len(rows)):
            cells = rows[i].findChildren('td')
            tabel.append([])
            for j in range(len(cells)):
                value = cells[j].string
                tabel[i].append(value)
        # Keep only the 16-cell rows, dropping their last two cells.
        table = []
        for i in range(len(tabel)):
            if len(tabel[i]) == 16:
                del tabel[i][-2:]
                table.append(tabel[i])
In this case, for each page requested a POST request is issued with form url encoded parameter __EVENTTARGET & __VIEWSTATE :
__VIEWSTATE can be easily extracted from an input tag
__EVENTTARGET is different for each page, and its value is passed to a JavaScript function in each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The python script :
from bs4 import BeautifulSoup
import requests
import re
# extract data from page
def extract_data(soup):
    """Print and return the race rows from the page's race-card block.

    Takes the first ``div`` with class ``race-card``, reads the rows of its
    second ``tbody`` and converts each row into a tuple of stripped cell
    texts, one entry per column named in ``columns`` below.

    Rows with fewer cells than there are columns (including rows with no
    ``td`` at all) are skipped rather than raising ``IndexError``.

    Args:
        soup: parsed page (a BeautifulSoup document).

    Returns:
        The list of row tuples, which is also printed.
    """
    columns = (
        "date", "dist", "TP", "StmHCP", "Fin", "By", "WinnerOr2nd",
        "Venue", "Remarks", "WinTime", "Going", "SP", "Class", "CalcTm",
    )
    width = len(columns)

    tables = soup.find_all("div", {"class": "race-card"})[0].find_all("tbody")

    item_list = []
    for row in tables[1].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < width:
            continue
        item_list.append(tuple(cell.text.strip() for cell in cells[:width]))

    print(item_list)
    return item_list
# Captures the first argument of the __doPostBack(...) call in a pager link.
# Written as a raw string so the backslash escapes reach the regex engine
# unchanged, and compiled once because it is applied to every pager anchor.
POSTBACK_TARGET = re.compile(r"__doPostBack\('(.*)',")

session = requests.Session()
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'

response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# get view state value
view_state = soup.find_all("input", {"id": "__VIEWSTATE"})[0]["value"]

# get all event target values
event_target = soup.find_all("div", {"class": "rgNumPart"})[0]
event_target_list = [
    POSTBACK_TARGET.search(t["href"]).group(1)
    for t in event_target.find_all('a')
]

# extract data for the 1st page
extract_data(soup)

# extract data for each page except the first
for link in event_target_list[1:]:
    print("get page {0}".format(link))
    post_data = {
        '__EVENTTARGET': link,
        '__VIEWSTATE': view_state
    }
    response = session.post(url, data=post_data)
    soup = BeautifulSoup(response.content, "html.parser")
    extract_data(soup)